From a58ee49307557e5030dd7e22c38be39ca2d643f3 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 31 Oct 2015 19:18:46 +0100 Subject: [PATCH 1/8] Optimize internal imagequery focus on using content_type to select images (in favor of url file extension) --- source/net/yacy/search/query/QueryGoal.java | 36 +++++++++++++++---- .../schema/CollectionConfiguration.java | 7 ++-- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 70b551c4c..5f38ad4b1 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -330,16 +330,29 @@ public class QueryGoal { for (final byte[] b: blues) this.include_hashes.remove(b); } + /** + * Generate a Solr filter query to receive valid urls + * + * This filters out error-urls. + * On noimages=true a filter is added to exclude links to images + * using the content_type (as well as urls with common image file extension) + * + * @param noimages true if filter for images should be included + * @return Solr filter query + */ public List collectionTextFilterQuery(boolean noimages) { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200"); - if (noimages) fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)"); + if (noimages) { + fqs.add("-" + CollectionSchema.content_type.getSolrFieldName() + ":(image/*)"); + fqs.add("-" + CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif)"); + } return fqs; } - + public StringBuilder collectionTextQuery() { // parse special requests @@ -348,16 +361,27 @@ public class QueryGoal { // add goal query return getGoalQuery(); } - + + /** + * Generate a Solr filter query to receive valid image results. 
+ * + * This filters error-urls out and includes urls with mime image/* as well + * as urls with links to images. + * We use the mime (image/*) only to find images as the parser assigned the + * best mime to index documents. This applies also to parsed file systems. + * This ensures that no text urls with image-fileextension is returned + * (as some large internet sites like to use such urls) + * + * @return Solr filter query for image urls + */ public List collectionImageFilterQuery() { final ArrayList fqs = new ArrayList<>(); // add filter to prevent that results come from failed urls fqs.add(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200"); fqs.add( - CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM + " OR " + - CollectionSchema.url_file_ext_s.getSolrFieldName() + ":(jpg OR png OR gif) OR " + - CollectionSchema.content_type.getSolrFieldName() + ":(image/*)"); + CollectionSchema.content_type.getSolrFieldName() + ":(image/*) OR " + + CollectionSchema.images_urlstub_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); return fqs; } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index f98d49c78..047aee7a0 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -419,7 +419,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final DigestURL digestURL = document.dc_source(); boolean allAttr = this.isEmpty(); String url = addURIAttributes(doc, allAttr, digestURL); - if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); + add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to 
check) Set processTypes = new LinkedHashSet(); String host = digestURL.getHost(); @@ -2028,9 +2028,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final SolrInputDocument doc = new SolrInputDocument(); String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL()); - - if (allAttr || configuration.contains(CollectionSchema.content_type)) configuration.add(doc, CollectionSchema.content_type, new String[]{Classification.url2mime(this.digestURL)}); - + // content_type (mime) is defined a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to check) + CollectionSchema.content_type.add(doc, new String[]{Classification.url2mime(this.digestURL)}); if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, getFailDate()); if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth); From 11f36666602cead6a420f7349e934462a74b4364 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 31 Oct 2015 19:44:31 +0100 Subject: [PATCH 2/8] increase use of predefined CATCHALL_QUERY string --- source/net/yacy/search/index/Fulltext.java | 2 +- source/net/yacy/search/query/QueryGoal.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 1b460d0a1..d6d6b411b 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -649,7 +649,7 @@ public final class Fulltext { // format: 0=text, 1=html, 2=rss/xml this.f = f; this.pattern = filter == null ? null : Pattern.compile(filter); - this.query = query == null? "*:*" : query; + this.query = query == null? 
AbstractSolrConnector.CATCHALL_QUERY : query; this.count = 0; this.failure = null; this.format = format; diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 5f38ad4b1..e3338dbd2 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -356,7 +356,7 @@ public class QueryGoal { public StringBuilder collectionTextQuery() { // parse special requests - if (isCatchall()) return new StringBuilder("*:*"); + if (isCatchall()) return new StringBuilder(AbstractSolrConnector.CATCHALL_QUERY); // add goal query return getGoalQuery(); @@ -389,7 +389,7 @@ public class QueryGoal { final StringBuilder q = new StringBuilder(80); // parse special requests - if (isCatchall()) return new StringBuilder("*:*"); + if (isCatchall()) return new StringBuilder(AbstractSolrConnector.CATCHALL_QUERY); // add goal query StringBuilder w = getGoalQuery(); From 02afba730eb3d0f9110e3c1a875297ceb32b4602 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 31 Oct 2015 22:53:59 +0100 Subject: [PATCH 3/8] fix detection of https port changed after set in System Admin --- htroot/SettingsAck_p.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 70e125474..601ab2cbf 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -494,7 +494,7 @@ public class SettingsAck_p { // change https port if (post.containsKey("port.ssl")) { int port = post.getInt("port.ssl", 8443); - if (port > 0 && port != env.getLocalPort("port", 8090)) { + if (port > 0 && port != env.getConfigInt("port.ssl", 8443)) { env.setConfig("port.ssl", port); } prop.put("info_port.ssl", port); From a60b1fb6c2eb72ee33d6ecee12e362853f410bf2 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 31 Oct 2015 23:09:03 +0100 Subject: [PATCH 4/8] differentiate api call getLocalPort() from getConfigInt() --- htroot/ConfigBasic.java | 4 ++-- htroot/ConfigPortal.java | 
2 +- htroot/ConfigSearchBox.java | 2 +- htroot/CrawlStartScanner_p.java | 4 ++-- htroot/Load_MediawikiWiki.java | 2 +- htroot/Load_PHPBB3.java | 2 +- htroot/SettingsAck_p.java | 2 +- htroot/Settings_p.java | 2 +- htroot/Table_API_p.java | 2 +- htroot/api/push_p.java | 2 +- htroot/opensearchdescription.java | 2 +- htroot/yacysearch.java | 4 ++-- htroot/yacysearch_location.java | 4 ++-- source/net/yacy/gui/Tray.java | 4 ++-- source/net/yacy/http/Jetty9HttpServerImpl.java | 6 +++--- source/net/yacy/peers/SeedDB.java | 2 +- source/net/yacy/search/Switchboard.java | 4 ++-- source/net/yacy/server/serverSwitch.java | 10 ++++------ source/net/yacy/yacy.java | 4 ++-- 19 files changed, 31 insertions(+), 33 deletions(-) diff --git a/htroot/ConfigBasic.java b/htroot/ConfigBasic.java index 0e2d5a6fc..5027819b0 100644 --- a/htroot/ConfigBasic.java +++ b/htroot/ConfigBasic.java @@ -100,7 +100,7 @@ public class ConfigBasic { port = post.getLong("port", 8090); ssl = post.getBoolean("withssl"); } else { - port = env.getLocalPort("port", 8090); //this allows a low port, but it will only get one, if the user edits the config himself. + port = env.getLocalPort(); //this allows a low port, but it will only get one, if the user edits the config himself. ssl = env.getConfigBool("server.https", false); } if (ssl) prop.put("withsslenabled_sslport",env.getHttpServer().getSslPort()); @@ -266,7 +266,7 @@ public class ConfigBasic { // set default values prop.putHTML("defaultName", sb.peers.mySeed().getName()); - prop.put("defaultPort", env.getLocalPort("port", 8090)); + prop.put("defaultPort", env.getLocalPort()); prop.put("withsslenabled", env.getConfigBool("server.https", false) ? 
1 : 0); lang = env.getConfig("locale.language", "default"); // re-assign lang, may have changed prop.put("lang_de", "0"); diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index 3aa5627b3..2205e2fa9 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -224,7 +224,7 @@ public class ConfigPortal { String myaddress = (sb.peers == null) || sb.peers.mySeed() == null || sb.peers.mySeed().getIP() == null ? null : sb.peers.mySeed().getPublicAddress(sb.peers.mySeed().getIP()); if (myaddress == null) { - myaddress = "localhost:" + sb.getLocalPort("port", 8090); + myaddress = "localhost:" + sb.getLocalPort(); } prop.put("myaddress", myaddress); return prop; diff --git a/htroot/ConfigSearchBox.java b/htroot/ConfigSearchBox.java index c64bf7293..eb9d953e9 100644 --- a/htroot/ConfigSearchBox.java +++ b/htroot/ConfigSearchBox.java @@ -35,7 +35,7 @@ public class ConfigSearchBox { final Switchboard sb = (Switchboard) env; String myaddress = sb.peers.mySeed().getPublicAddress(sb.peers.mySeed().getIP()); - if (myaddress == null) myaddress = "localhost:" + sb.getLocalPort("port", 8090); + if (myaddress == null) myaddress = "localhost:" + sb.getLocalPort(); prop.put("myaddress", myaddress); return prop; } diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index ed5386376..1b93abb4a 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -214,7 +214,7 @@ public class CrawlStartScanner_p path += "&crawlingURL=" + url.toNormalform(true); WorkTables.execAPICall( Domains.LOCALHOST, - sb.getLocalPort("port", 8090), + sb.getLocalPort(), path, pk, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), @@ -261,7 +261,7 @@ public class CrawlStartScanner_p path += "&crawlingURL=" + urlString; WorkTables.execAPICall( Domains.LOCALHOST, - sb.getLocalPort("port", 8090), + sb.getLocalPort(), path, u.hash(), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), diff --git 
a/htroot/Load_MediawikiWiki.java b/htroot/Load_MediawikiWiki.java index bb18ba755..5260a68d6 100644 --- a/htroot/Load_MediawikiWiki.java +++ b/htroot/Load_MediawikiWiki.java @@ -39,7 +39,7 @@ public class Load_MediawikiWiki { // define visible variables String a = sb.peers.mySeed().getPublicAddress(sb.peers.mySeed().getIP()); - if (a == null) a = "localhost:" + sb.getLocalPort("port", 8090); + if (a == null) a = "localhost:" + sb.getLocalPort(); final boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet"); final String repository = "http://" + a + "/repository/"; prop.put("starturl", (intranet) ? repository : "http://"); diff --git a/htroot/Load_PHPBB3.java b/htroot/Load_PHPBB3.java index bc9b3d9f0..e7d03398f 100644 --- a/htroot/Load_PHPBB3.java +++ b/htroot/Load_PHPBB3.java @@ -39,7 +39,7 @@ public class Load_PHPBB3 { // define visible variables String a = sb.peers.mySeed().getPublicAddress(sb.peers.mySeed().getIP()); - if (a == null) a = "localhost:" + sb.getLocalPort("port", 8090); + if (a == null) a = "localhost:" + sb.getLocalPort(); final boolean intranet = sb.getConfig(SwitchboardConstants.NETWORK_NAME, "").equals("intranet"); final String repository = "http://" + a + "/repository/"; prop.put("starturl", (intranet) ? 
repository : "http://"); diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 601ab2cbf..f5ed41553 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -99,7 +99,7 @@ public class SettingsAck_p { /* * display port info */ - prop.put("info_port", env.getLocalPort("port", 8090)); + prop.put("info_port", env.getLocalPort()); prop.put("info_restart", "0"); // read and process data diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index af1a7c0ac..0c03defbb 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -73,7 +73,7 @@ public final class Settings_p { prop.put("settingsTables", ""); } - prop.put("port", env.getLocalPort("port", 8090)); + prop.put("port", env.getLocalPort()); prop.putHTML("peerName", sb.peers.mySeed().getName()); prop.putHTML("staticIP", env.getConfig("staticIP", "")); diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 513bb46ae..117cf30b4 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -208,7 +208,7 @@ public class Table_API_p { } // now call the api URLs and store the result status - final Map l = sb.tables.execAPICalls(Domains.LOCALHOST, sb.getLocalPort("port", 8090), pks, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "")); + final Map l = sb.tables.execAPICalls(Domains.LOCALHOST, sb.getLocalPort(), pks, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "")); // construct result table prop.put("showexec", l.isEmpty() ? 0 : 1); diff --git a/htroot/api/push_p.java b/htroot/api/push_p.java index 84689af62..912586220 100644 --- a/htroot/api/push_p.java +++ b/htroot/api/push_p.java @@ -126,7 +126,7 @@ public class push_p { Set ips = Domains.myPublicIPs(); String address = ips.size() == 0 ? 
"127.0.0.1" : ips.iterator().next(); if (address == null) address = "127.0.0.1"; - prop.put("mode_results_" + i + "_success_message", "http://" + address + ":" + sb.getLocalPort("port", 8090) + "/solr/select?q=sku:%22" + u + "%22"); + prop.put("mode_results_" + i + "_success_message", "http://" + address + ":" + sb.getLocalPort() + "/solr/select?q=sku:%22" + u + "%22"); countsuccess++; } catch (MalformedURLException e) { e.printStackTrace(); diff --git a/htroot/opensearchdescription.java b/htroot/opensearchdescription.java index baa39ad2c..de72ff1c1 100644 --- a/htroot/opensearchdescription.java +++ b/htroot/opensearchdescription.java @@ -40,7 +40,7 @@ public class opensearchdescription { if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); String thisaddress = header.get("Host", Domains.LOCALHOST); - if (thisaddress.indexOf(':',0) == -1) thisaddress += ":" + env.getLocalPort("port", 8090); + if (thisaddress.indexOf(':',0) == -1) thisaddress += ":" + env.getLocalPort(); String thisprotocol = env.getConfigBool("server.https", false) ? "https" : "http"; final serverObjects prop = new serverObjects(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index fee284c8d..0aea12e23 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -149,7 +149,7 @@ public class yacysearch { // adding some additional properties needed for the rss feed String hostName = header.get("Host", Domains.LOCALHOST); if ( hostName.indexOf(':', 0) == -1 ) { - hostName += ":" + env.getLocalPort("port", 8090); + hostName += ":" + env.getLocalPort(); } prop.put("searchBaseURL", "http://" + hostName + "/yacysearch.html"); prop.put("rssYacyImageURL", "http://" + hostName + "/env/grafics/yacy.png"); @@ -900,7 +900,7 @@ public class yacysearch { // hostname and port (assume locahost if nothing helps) final String hostIP = sb.peers.mySeed().getIP(); prop.put("myhost", hostIP != null ? 
hostIP : Domains.LOCALHOST); - prop.put("myport", Domains.LOCALHOST.equals(hostIP) ? sb.getLocalPort("port", 8090) : sb.getPublicPort("port", 8090)); + prop.put("myport", Domains.LOCALHOST.equals(hostIP) ? sb.getLocalPort() : sb.getPublicPort("port", 8090)); // return rewrite properties return prop; diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index f4203772f..febe02816 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -96,7 +96,7 @@ public class yacysearch_location { if (query.length() > 0 && (metatag || search_title || search_publisher || search_creator || search_subject)) try { // get a queue of search results - final String rssSearchServiceURL = "http://127.0.0.1:" + sb.getLocalPort("port", 8090) + "/yacysearch.rss"; + final String rssSearchServiceURL = "http://127.0.0.1:" + sb.getLocalPort() + "/yacysearch.rss"; final BlockingQueue results = new LinkedBlockingQueue(); SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, lon == 0.0d && lat == 0.0d ? query : query + " /radius/" + lat + "/" + lon + "/" + radius, maximumTime, Integer.MAX_VALUE, null, false, ClientIdentification.yacyInternetCrawlerAgent); @@ -129,7 +129,7 @@ public class yacysearch_location { String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, ""); if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); String hostName = header.get("Host", Domains.LOCALHOST); - if (hostName.indexOf(':',0) == -1) hostName += ":" + env.getLocalPort("port", 8090); + if (hostName.indexOf(':',0) == -1) hostName += ":" + env.getLocalPort(); final String originalquerystring = (post == null) ? 
"" : post.get("query", post.get("search", "")).trim(); // SRU compliance final boolean global = post.get("kml_resource", "local").equals("global"); diff --git a/source/net/yacy/gui/Tray.java b/source/net/yacy/gui/Tray.java index 8de4d0b6d..9a61251a7 100644 --- a/source/net/yacy/gui/Tray.java +++ b/source/net/yacy/gui/Tray.java @@ -239,8 +239,8 @@ public final class Tray { } private String readyMessage() { - if (deutsch) return "YaCy laeuft unter http://localhost:" + sb.getLocalPort("port", 8090); - return "YaCy is running at http://localhost:" + sb.getLocalPort("port", 8090); + if (deutsch) return "YaCy laeuft unter http://localhost:" + sb.getLocalPort(); + return "YaCy is running at http://localhost:" + sb.getLocalPort(); } private String shutdownMessage() { diff --git a/source/net/yacy/http/Jetty9HttpServerImpl.java b/source/net/yacy/http/Jetty9HttpServerImpl.java index cf4123640..431a41325 100644 --- a/source/net/yacy/http/Jetty9HttpServerImpl.java +++ b/source/net/yacy/http/Jetty9HttpServerImpl.java @@ -83,7 +83,7 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { final SSLContext sslContext = initSslContext(sb); if (sslContext != null) { - int sslport = sb.getLocalPort("port.ssl", 8443); + int sslport = sb.getConfigInt("port.ssl", 8443); sslContextFactory.setSslContext(sslContext); // SSL HTTP Configuration @@ -289,8 +289,8 @@ public class Jetty9HttpServerImpl implements YaCyHttpServer { } try { // reconnect with new settings (instead to stop/start server, just manipulate connectors final Connector[] cons = server.getConnectors(); - final int port = Switchboard.getSwitchboard().getLocalPort("port", 8090); - final int sslport = Switchboard.getSwitchboard().getLocalPort("port.ssl", 8443); + final int port = Switchboard.getSwitchboard().getLocalPort(); + final int sslport = Switchboard.getSwitchboard().getConfigInt("port.ssl", 8443); for (Connector con : cons) { // check http connector if (con.getName().startsWith("httpd") && 
((ServerConnector)con).getPort() != port) { diff --git a/source/net/yacy/peers/SeedDB.java b/source/net/yacy/peers/SeedDB.java index 533715e77..f4869638b 100644 --- a/source/net/yacy/peers/SeedDB.java +++ b/source/net/yacy/peers/SeedDB.java @@ -959,7 +959,7 @@ public final class SeedDB implements AlternativeDomainNames { if (this.mySeed == null) initMySeed(); if (seed == this.mySeed && !(seed.isOnline())) { // take local ip instead of external - return Switchboard.getSwitchboard().myPublicIP() + ":" + Switchboard.getSwitchboard().getLocalPort("port", 8090) + ((subdom == null) ? "" : ("/" + subdom)); + return Switchboard.getSwitchboard().myPublicIP() + ":" + Switchboard.getSwitchboard().getLocalPort() + ((subdom == null) ? "" : ("/" + subdom)); } return seed.getPublicAddress(seed.getIP()) + ((subdom == null) ? "" : ("/" + subdom)); } else { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 1ecbffa5e..9597b9438 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -304,7 +304,7 @@ public final class Switchboard extends serverSwitch { super(dataPath, appPath, initPath, configPath); sb = this; // check if port is already occupied - final int port = getLocalPort("port", 8090); + final int port = getLocalPort(); if (TimeoutRequest.ping(Domains.LOCALHOST, port, 500)) { throw new RuntimeException( "a server is already running on the YaCy port " @@ -2200,7 +2200,7 @@ public final class Switchboard extends serverSwitch { startupAction = false; // execute api calls - final Map callResult = this.tables.execAPICalls("localhost", getLocalPort("port", 8090), pks, getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "")); + final Map callResult = this.tables.execAPICalls("localhost", getLocalPort(), pks, getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), 
getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "")); for ( final Map.Entry call : callResult.entrySet() ) { this.log.info("Scheduler executed api call, response " + call.getValue() + ": " + call.getKey()); } diff --git a/source/net/yacy/server/serverSwitch.java b/source/net/yacy/server/serverSwitch.java index a63fb760e..02fe4443e 100644 --- a/source/net/yacy/server/serverSwitch.java +++ b/source/net/yacy/server/serverSwitch.java @@ -222,16 +222,14 @@ public class serverSwitch { * Wrapper for {@link #getConfigInt(String, int)} to have a more consistent * API. * - * @param key - * original key from config (for example "port" or "port.ssl") - * @param dflt - * default value which will be used if no value is found + * Default value 8090 will be used if no value is found + * * @return the local port of this system * @see #getPublicPort(String, int) */ - public int getLocalPort(final String key, final int dflt) { + public int getLocalPort() { - return getConfigInt(key, dflt); + return getConfigInt("port", 8090); } // a logger for this switchboard diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 4726fde80..4d2edc64f 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -218,7 +218,7 @@ public final class yacy { sb.setConfig("memoryTotalAfterStartup", startupMemTotal); // start gui if wanted - if (gui) YaCyApp.start("localhost", sb.getLocalPort("port", 8090)); + if (gui) YaCyApp.start("localhost", sb.getLocalPort()); // hardcoded, forced, temporary value-migration sb.setConfig("htTemplatePath", "htroot/env/templates"); @@ -293,7 +293,7 @@ public final class yacy { HTTPClient.setDefaultUserAgent(ClientIdentification.yacyInternetCrawlerAgent.userAgent); // start main threads - final int port = sb.getLocalPort("port", 8090); + final int port = sb.getLocalPort(); try { // start http server YaCyHttpServer httpServer; From 2fcf6f104caa92d542a27db91662f9a6afcaa72c Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 3 Nov 2015 03:35:01 
+0100 Subject: [PATCH 5/8] fix bzipParser recognition - Bzip2Inputstream checks magic byte itself to identify bz2 (leave it in input) - try to supply fitting mime for parsing bz2 content --- .../net/yacy/document/parser/bzipParser.java | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 4e16fbfce..fe95e8ab7 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -32,6 +32,7 @@ import java.io.FileOutputStream; import java.io.InputStream; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -40,6 +41,7 @@ import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2Utils; public class bzipParser extends AbstractParser implements Parser { @@ -69,21 +71,9 @@ public class bzipParser extends AbstractParser implements Parser { File tempFile = null; Document[] docs; try { - /* - * First we have to consume the first two char from the stream. Otherwise - * the bzip decompression will fail with a nullpointerException! 
- */ - int b = source.read(); - if (b != 'B') { - throw new Exception("Invalid bz2 content."); - } - b = source.read(); - if (b != 'Z') { - throw new Exception("Invalid bz2 content."); - } - int read = 0; final byte[] data = new byte[1024]; + // BZip2CompressorInputStream checks filecontent (magic start-bytes "BZh") and throws ioexception if no match final BZip2CompressorInputStream zippedContent = new BZip2CompressorInputStream(source); tempFile = File.createTempFile("bunzip","tmp"); @@ -100,7 +90,10 @@ public class bzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); + final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); + final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); + docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding) } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; From 5d71fc70e3f21b46a87f4f3edd708196e11f8caf Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 3 Nov 2015 22:14:14 +0100 Subject: [PATCH 6/8] fix tarParser early exit on looping content - adjust check of data available according to doc - return null on no recognized content (to not exit TextParser next parser try) - use commons.compress directly --- source/net/yacy/document/parser/tarParser.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 52a84e296..be4b515fd 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ 
b/source/net/yacy/document/parser/tarParser.java @@ -43,8 +43,8 @@ import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; -import org.apache.tools.tar.TarEntry; -import org.apache.tools.tar.TarInputStream; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; // this is a new implementation of this parser idiom using multiple documents as result set @@ -80,15 +80,14 @@ public class tarParser extends AbstractParser implements Parser { throw new Parser.Failure("tar parser: " + e.getMessage(), location); } } - TarEntry entry; - final TarInputStream tis = new TarInputStream(source); + TarArchiveEntry entry; + final TarArchiveInputStream tis = new TarArchiveInputStream(source); File tmp = null; // loop through the elements in the tar file and parse every single file inside while (true) { try { - if (tis.available() <= 0) break; - entry = tis.getNextEntry(); + entry = tis.getNextTarEntry(); if (entry == null) break; if (entry.isDirectory() || entry.getSize() <= 0) continue; final String name = entry.getName(); @@ -110,6 +109,7 @@ public class tarParser extends AbstractParser implements Parser { break; } } + if (docacc.isEmpty()) return null; return docacc.toArray(new Document[docacc.size()]); } From 681889ae64921745078523ae2283a89c95bd4776 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 4 Nov 2015 02:57:00 +0100 Subject: [PATCH 7/8] use current tar library for untar files - remove old source copy --- source/net/yacy/utils/tarTools.java | 19 +- source/org/apache/tools/tar/TarBuffer.java | 461 ------------- source/org/apache/tools/tar/TarConstants.java | 158 ----- source/org/apache/tools/tar/TarEntry.java | 635 ------------------ .../org/apache/tools/tar/TarInputStream.java | 409 ----------- .../org/apache/tools/tar/TarOutputStream.java | 359 ---------- source/org/apache/tools/tar/TarUtils.java | 210 ------ 7 files 
changed, 10 insertions(+), 2241 deletions(-) delete mode 100644 source/org/apache/tools/tar/TarBuffer.java delete mode 100644 source/org/apache/tools/tar/TarConstants.java delete mode 100644 source/org/apache/tools/tar/TarEntry.java delete mode 100644 source/org/apache/tools/tar/TarInputStream.java delete mode 100644 source/org/apache/tools/tar/TarOutputStream.java delete mode 100644 source/org/apache/tools/tar/TarUtils.java diff --git a/source/net/yacy/utils/tarTools.java b/source/net/yacy/utils/tarTools.java index 074440204..9af805e3b 100644 --- a/source/net/yacy/utils/tarTools.java +++ b/source/net/yacy/utils/tarTools.java @@ -35,8 +35,9 @@ import java.util.zip.GZIPInputStream; import net.yacy.cora.util.ConcurrentLog; -import org.apache.tools.tar.TarEntry; -import org.apache.tools.tar.TarInputStream; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.io.IOUtils; public class tarTools { @@ -69,19 +70,19 @@ public class tarTools { public static void unTar(final InputStream in, final String untarDir) throws Exception{ ConcurrentLog.info("UNTAR", "starting"); if(new File(untarDir).exists()){ - final TarInputStream tin = new TarInputStream(in); - TarEntry tarEntry = tin.getNextEntry(); + final TarArchiveInputStream tin = new TarArchiveInputStream(in); + TarArchiveEntry tarEntry = tin.getNextTarEntry(); while(tarEntry != null){ final File destPath = new File(untarDir + File.separator + tarEntry.getName()); if (!tarEntry.isDirectory()) { new File(destPath.getParent()).mkdirs(); // create missing subdirectories final FileOutputStream fout = new FileOutputStream(destPath); - tin.copyEntryContents(fout); + IOUtils.copyLarge(tin,fout,0,tarEntry.getSize()); fout.close(); } else { destPath.mkdir(); } - tarEntry = tin.getNextEntry(); + tarEntry = tin.getNextTarEntry(); } tin.close(); } else { // untarDir doesn't exist @@ -89,8 +90,8 @@ public class tarTools { } 
ConcurrentLog.info("UNTAR", "finished"); } - - public static void main(final String args[]) { + + public static void main(final String args[]) { // @arg0 source // @arg1 destination if(args.length == 2){ @@ -103,4 +104,4 @@ public class tarTools { System.out.println("usage: "); } } -} +} diff --git a/source/org/apache/tools/tar/TarBuffer.java b/source/org/apache/tools/tar/TarBuffer.java deleted file mode 100644 index f33882bab..000000000 --- a/source/org/apache/tools/tar/TarBuffer.java +++ /dev/null @@ -1,461 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ - -package org.apache.tools.tar; - -import java.io.InputStream; -import java.io.OutputStream; -import java.io.IOException; -import java.util.Arrays; - -/** - * The TarBuffer class implements the tar archive concept - * of a buffered input stream. This concept goes back to the - * days of blocked tape drives and special io devices. In the - * Java universe, the only real function that this class - * performs is to ensure that files have the correct "block" - * size, or other tars will complain. - *

- * You should never have a need to access this class directly. - * TarBuffers are created by Tar IO Streams. - * - */ - -public class TarBuffer { - - /** Default record size */ - public static final int DEFAULT_RCDSIZE = (512); - - /** Default block size */ - public static final int DEFAULT_BLKSIZE = (DEFAULT_RCDSIZE * 20); - - private InputStream inStream; - private OutputStream outStream; - private byte[] blockBuffer; - private int currBlkIdx; - private int currRecIdx; - private int blockSize; - private int recordSize; - private int recsPerBlock; - private boolean debug; - - /** - * Constructor for a TarBuffer on an input stream. - * @param inStream the input stream to use - */ - public TarBuffer(InputStream inStream) { - this(inStream, TarBuffer.DEFAULT_BLKSIZE); - } - - /** - * Constructor for a TarBuffer on an input stream. - * @param inStream the input stream to use - * @param blockSize the block size to use - */ - public TarBuffer(InputStream inStream, int blockSize) { - this(inStream, blockSize, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for a TarBuffer on an input stream. - * @param inStream the input stream to use - * @param blockSize the block size to use - * @param recordSize the record size to use - */ - public TarBuffer(InputStream inStream, int blockSize, int recordSize) { - this.inStream = inStream; - this.outStream = null; - - this.initialize(blockSize, recordSize); - } - - /** - * Constructor for a TarBuffer on an output stream. - * @param outStream the output stream to use - */ - public TarBuffer(OutputStream outStream) { - this(outStream, TarBuffer.DEFAULT_BLKSIZE); - } - - /** - * Constructor for a TarBuffer on an output stream. - * @param outStream the output stream to use - * @param blockSize the block size to use - */ - public TarBuffer(OutputStream outStream, int blockSize) { - this(outStream, blockSize, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for a TarBuffer on an output stream. 
- * @param outStream the output stream to use - * @param blockSize the block size to use - * @param recordSize the record size to use - */ - public TarBuffer(OutputStream outStream, int blockSize, int recordSize) { - this.inStream = null; - this.outStream = outStream; - - this.initialize(blockSize, recordSize); - } - - /** - * Initialization common to all constructors. - */ - private void initialize(int blockSize, int recordSize) { - this.debug = false; - this.blockSize = blockSize; - this.recordSize = recordSize; - this.recsPerBlock = (this.blockSize / this.recordSize); - this.blockBuffer = new byte[this.blockSize]; - - if (this.inStream != null) { - this.currBlkIdx = -1; - this.currRecIdx = this.recsPerBlock; - } else { - this.currBlkIdx = 0; - this.currRecIdx = 0; - } - } - - /** - * Get the TAR Buffer's block size. Blocks consist of multiple records. - * @return the block size - */ - public int getBlockSize() { - return this.blockSize; - } - - /** - * Get the TAR Buffer's record size. - * @return the record size - */ - public int getRecordSize() { - return this.recordSize; - } - - /** - * Set the debugging flag for the buffer. - * - * @param debug If true, print debugging output. - */ - public void setDebug(boolean debug) { - this.debug = debug; - } - - /** - * Determine if an archive record indicate End of Archive. End of - * archive is indicated by a record that consists entirely of null bytes. - * - * @param record The record data to check. - * @return true if the record data is an End of Archive - */ - public boolean isEOFRecord(byte[] record) { - for (int i = 0, sz = this.getRecordSize(); i < sz; ++i) { - if (record[i] != 0) { - return false; - } - } - - return true; - } - - /** - * Skip over a record on the input stream. 
- * @throws IOException on error - */ - public void skipRecord() throws IOException { - if (this.debug) { - System.err.println("SkipRecord: recIdx = " + this.currRecIdx - + " blkIdx = " + this.currBlkIdx); - } - - if (this.inStream == null) { - throw new IOException("reading (via skip) from an output buffer"); - } - - if (this.currRecIdx >= this.recsPerBlock) { - if (!this.readBlock()) { - return; // UNDONE - } - } - - this.currRecIdx++; - } - - /** - * Read a record from the input stream and return the data. - * - * @return The record data. - * @throws IOException on error - */ - public byte[] readRecord() throws IOException { - if (this.debug) { - System.err.println("ReadRecord: recIdx = " + this.currRecIdx - + " blkIdx = " + this.currBlkIdx); - } - - if (this.inStream == null) { - throw new IOException("reading from an output buffer"); - } - - if (this.currRecIdx >= this.recsPerBlock) { - if (!this.readBlock()) { - return null; - } - } - - byte[] result = new byte[this.recordSize]; - - System.arraycopy(this.blockBuffer, - (this.currRecIdx * this.recordSize), result, 0, - this.recordSize); - - this.currRecIdx++; - - return result; - } - - /** - * @return false if End-Of-File, else true - */ - private boolean readBlock() throws IOException { - if (this.debug) { - System.err.println("ReadBlock: blkIdx = " + this.currBlkIdx); - } - - if (this.inStream == null) { - throw new IOException("reading from an output buffer"); - } - - this.currRecIdx = 0; - - int offset = 0; - int bytesNeeded = this.blockSize; - - while (bytesNeeded > 0) { - long numBytes = this.inStream.read(this.blockBuffer, offset, - bytesNeeded); - - // - // NOTE - // We have fit EOF, and the block is not full! - // - // This is a broken archive. It does not follow the standard - // blocking algorithm. However, because we are generous, and - // it requires little effort, we will simply ignore the error - // and continue as if the entire block were read. 
This does - // not appear to break anything upstream. We used to return - // false in this case. - // - // Thanks to 'Yohann.Roussel@alcatel.fr' for this fix. - // - if (numBytes == -1) { - if (offset == 0) { - // Ensure that we do not read gigabytes of zeros - // for a corrupt tar file. - // See http://issues.apache.org/bugzilla/show_bug.cgi?id=39924 - return false; - } - // However, just leaving the unread portion of the buffer dirty does - // cause problems in some cases. This problem is described in - // http://issues.apache.org/bugzilla/show_bug.cgi?id=29877 - // - // The solution is to fill the unused portion of the buffer with zeros. - - Arrays.fill(blockBuffer, offset, offset + bytesNeeded, (byte) 0); - - break; - } - - offset += numBytes; - bytesNeeded -= numBytes; - - if (numBytes != this.blockSize) { - if (this.debug) { - System.err.println("ReadBlock: INCOMPLETE READ " - + numBytes + " of " + this.blockSize - + " bytes read."); - } - } - } - - this.currBlkIdx++; - - return true; - } - - /** - * Get the current block number, zero based. - * - * @return The current zero based block number. - */ - public int getCurrentBlockNum() { - return this.currBlkIdx; - } - - /** - * Get the current record number, within the current block, zero based. - * Thus, current offset = (currentBlockNum * recsPerBlk) + currentRecNum. - * - * @return The current zero based record number. - */ - public int getCurrentRecordNum() { - return this.currRecIdx - 1; - } - - /** - * Write an archive record to the archive. - * - * @param record The record data to write to the archive. 
- * @throws IOException on error - */ - public void writeRecord(byte[] record) throws IOException { - if (this.debug) { - System.err.println("WriteRecord: recIdx = " + this.currRecIdx - + " blkIdx = " + this.currBlkIdx); - } - - if (this.outStream == null) { - throw new IOException("writing to an input buffer"); - } - - if (record.length != this.recordSize) { - throw new IOException("record to write has length '" - + record.length - + "' which is not the record size of '" - + this.recordSize + "'"); - } - - if (this.currRecIdx >= this.recsPerBlock) { - this.writeBlock(); - } - - System.arraycopy(record, 0, this.blockBuffer, - (this.currRecIdx * this.recordSize), - this.recordSize); - - this.currRecIdx++; - } - - /** - * Write an archive record to the archive, where the record may be - * inside of a larger array buffer. The buffer must be "offset plus - * record size" long. - * - * @param buf The buffer containing the record data to write. - * @param offset The offset of the record data within buf. - * @throws IOException on error - */ - public void writeRecord(byte[] buf, int offset) throws IOException { - if (this.debug) { - System.err.println("WriteRecord: recIdx = " + this.currRecIdx - + " blkIdx = " + this.currBlkIdx); - } - - if (this.outStream == null) { - throw new IOException("writing to an input buffer"); - } - - if ((offset + this.recordSize) > buf.length) { - throw new IOException("record has length '" + buf.length - + "' with offset '" + offset - + "' which is less than the record size of '" - + this.recordSize + "'"); - } - - if (this.currRecIdx >= this.recsPerBlock) { - this.writeBlock(); - } - - System.arraycopy(buf, offset, this.blockBuffer, - (this.currRecIdx * this.recordSize), - this.recordSize); - - this.currRecIdx++; - } - - /** - * Write a TarBuffer block to the archive. 
- */ - private void writeBlock() throws IOException { - if (this.debug) { - System.err.println("WriteBlock: blkIdx = " + this.currBlkIdx); - } - - if (this.outStream == null) { - throw new IOException("writing to an input buffer"); - } - - this.outStream.write(this.blockBuffer, 0, this.blockSize); - this.outStream.flush(); - - this.currRecIdx = 0; - this.currBlkIdx++; - } - - /** - * Flush the current data block if it has any data in it. - */ - private void flushBlock() throws IOException { - if (this.debug) { - System.err.println("TarBuffer.flushBlock() called."); - } - - if (this.outStream == null) { - throw new IOException("writing to an input buffer"); - } - - if (this.currRecIdx > 0) { - this.writeBlock(); - } - } - - /** - * Close the TarBuffer. If this is an output buffer, also flush the - * current block before closing. - * @throws IOException on error - */ - public synchronized void close() throws IOException { - if (this.debug) { - System.err.println("TarBuffer.closeBuffer()."); - } - - if (this.outStream != null) { - this.flushBlock(); - - if (this.outStream != System.out - && this.outStream != System.err) { - this.outStream.close(); - - this.outStream = null; - } - } else if (this.inStream != null) { - if (this.inStream != System.in) { - this.inStream.close(); - - this.inStream = null; - } - } - } -} diff --git a/source/org/apache/tools/tar/TarConstants.java b/source/org/apache/tools/tar/TarConstants.java deleted file mode 100644 index 2ba5d6667..000000000 --- a/source/org/apache/tools/tar/TarConstants.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ - -package org.apache.tools.tar; - -/** - * This interface contains all the definitions used in the package. - * - */ -// CheckStyle:InterfaceIsTypeCheck OFF (bc) -public interface TarConstants { - - /** - * The length of the name field in a header buffer. - */ - int NAMELEN = 100; - - /** - * The length of the mode field in a header buffer. - */ - int MODELEN = 8; - - /** - * The length of the user id field in a header buffer. - */ - int UIDLEN = 8; - - /** - * The length of the group id field in a header buffer. - */ - int GIDLEN = 8; - - /** - * The length of the checksum field in a header buffer. - */ - int CHKSUMLEN = 8; - - /** - * The length of the size field in a header buffer. - */ - int SIZELEN = 12; - - /** - * The maximum size of a file in a tar archive (That's 11 sevens, octal). - */ - long MAXSIZE = 077777777777L; - - /** - * The length of the magic field in a header buffer. - */ - int MAGICLEN = 8; - - /** - * The length of the modification time field in a header buffer. - */ - int MODTIMELEN = 12; - - /** - * The length of the user name field in a header buffer. - */ - int UNAMELEN = 32; - - /** - * The length of the group name field in a header buffer. - */ - int GNAMELEN = 32; - - /** - * The length of the devices field in a header buffer. - */ - int DEVLEN = 8; - - /** - * LF_ constants represent the "link flag" of an entry, or more commonly, - * the "entry type". 
This is the "old way" of indicating a normal file. - */ - byte LF_OLDNORM = 0; - - /** - * Normal file type. - */ - byte LF_NORMAL = (byte) '0'; - - /** - * Link file type. - */ - byte LF_LINK = (byte) '1'; - - /** - * Symbolic link file type. - */ - byte LF_SYMLINK = (byte) '2'; - - /** - * Character device file type. - */ - byte LF_CHR = (byte) '3'; - - /** - * Block device file type. - */ - byte LF_BLK = (byte) '4'; - - /** - * Directory file type. - */ - byte LF_DIR = (byte) '5'; - - /** - * FIFO (pipe) file type. - */ - byte LF_FIFO = (byte) '6'; - - /** - * Contiguous file type. - */ - byte LF_CONTIG = (byte) '7'; - - /** - * The magic tag representing a POSIX tar archive. - */ - String TMAGIC = "ustar"; - - /** - * The magic tag representing a GNU tar archive. - */ - String GNU_TMAGIC = "ustar "; - - /** - * The namr of the GNU tar entry which contains a long name. - */ - String GNU_LONGLINK = "././@LongLink"; - - /** - * Identifies the *next* file on the tape as having a long name. - */ - byte LF_GNUTYPE_LONGNAME = (byte) 'L'; -} diff --git a/source/org/apache/tools/tar/TarEntry.java b/source/org/apache/tools/tar/TarEntry.java deleted file mode 100644 index 6f815c532..000000000 --- a/source/org/apache/tools/tar/TarEntry.java +++ /dev/null @@ -1,635 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ -/* - * Modifications (Michael Christen) - * - replaced StringBuffer with StringBuilder - */ - -package org.apache.tools.tar; - -import java.io.File; -import java.util.Date; -import java.util.Locale; - - -/** - * This class represents an entry in a Tar archive. It consists - * of the entry's header, as well as the entry's File. Entries - * can be instantiated in one of three ways, depending on how - * they are to be used. - *

- * TarEntries that are created from the header bytes read from - * an archive are instantiated with the TarEntry( byte[] ) - * constructor. These entries will be used when extracting from - * or listing the contents of an archive. These entries have their - * header filled in using the header bytes. They also set the File - * to null, since they reference an archive entry not a file. - *

- * TarEntries that are created from Files that are to be written - * into an archive are instantiated with the TarEntry( File ) - * constructor. These entries have their header filled in using - * the File's information. They also keep a reference to the File - * for convenience when writing entries. - *

- * Finally, TarEntries can be constructed from nothing but a name. - * This allows the programmer to construct the entry by hand, for - * instance when only an InputStream is available for writing to - * the archive, and the header information is constructed from - * other information. In this case the header fields are set to - * defaults and the File is set to null. - * - *

- * The C structure for a Tar Entry's header is: - *

- * struct header {
- * char name[NAMSIZ];
- * char mode[8];
- * char uid[8];
- * char gid[8];
- * char size[12];
- * char mtime[12];
- * char chksum[8];
- * char linkflag;
- * char linkname[NAMSIZ];
- * char magic[8];
- * char uname[TUNMLEN];
- * char gname[TGNMLEN];
- * char devmajor[8];
- * char devminor[8];
- * } header;
- * 
- * - */ - -public class TarEntry implements TarConstants { - /** The entry's name. */ - private StringBuilder name; - - /** The entry's permission mode. */ - private int mode; - - /** The entry's user id. */ - private int userId; - - /** The entry's group id. */ - private int groupId; - - /** The entry's size. */ - private long size; - - /** The entry's modification time. */ - private long modTime; - - /** The entry's link flag. */ - private byte linkFlag; - - /** The entry's link name. */ - private StringBuilder linkName; - - /** The entry's magic tag. */ - private StringBuilder magic; - - /** The entry's user name. */ - private StringBuilder userName; - - /** The entry's group name. */ - private StringBuilder groupName; - - /** The entry's major device number. */ - private int devMajor; - - /** The entry's minor device number. */ - private int devMinor; - - /** The entry's file reference */ - private File file; - - /** Maximum length of a user's name in the tar file */ - public static final int MAX_NAMELEN = 31; - - /** Default permissions bits for directories */ - public static final int DEFAULT_DIR_MODE = 040755; - - /** Default permissions bits for files */ - public static final int DEFAULT_FILE_MODE = 0100644; - - /** Convert millis to seconds */ - public static final int MILLIS_PER_SECOND = 1000; - - /** - * Construct an empty entry and prepares the header values. - */ - private TarEntry () { - this.magic = new StringBuilder(TMAGIC); - this.name = new StringBuilder(); - this.linkName = new StringBuilder(); - - String user = System.getProperty("user.name", ""); - - if (user.length() > MAX_NAMELEN) { - user = user.substring(0, MAX_NAMELEN); - } - - this.userId = 0; - this.groupId = 0; - this.userName = new StringBuilder(user); - this.groupName = new StringBuilder(""); - this.file = null; - } - - /** - * Construct an entry with only a name. This allows the programmer - * to construct the entry's header "by hand". File is set to null. 
- * - * @param name the entry name - */ - public TarEntry(final String name) { - this(); - - final boolean isDir = name.endsWith("/"); - - this.devMajor = 0; - this.devMinor = 0; - this.name = new StringBuilder(name); - this.mode = isDir ? DEFAULT_DIR_MODE : DEFAULT_FILE_MODE; - this.linkFlag = isDir ? LF_DIR : LF_NORMAL; - this.userId = 0; - this.groupId = 0; - this.size = 0; - this.modTime = (new Date()).getTime() / MILLIS_PER_SECOND; - this.linkName = new StringBuilder(""); - this.userName = new StringBuilder(""); - this.groupName = new StringBuilder(""); - this.devMajor = 0; - this.devMinor = 0; - - } - - /** - * Construct an entry with a name an a link flag. - * - * @param name the entry name - * @param linkFlag the entry link flag. - */ - public TarEntry(final String name, final byte linkFlag) { - this(name); - this.linkFlag = linkFlag; - } - - /** - * Construct an entry for a file. File is set to file, and the - * header is constructed from information from the file. - * - * @param file The file that the entry represents. - */ - public TarEntry(final File file) { - this(); - - this.file = file; - - String fileName = file.getPath(); - final String osname = System.getProperty("os.name").toLowerCase(Locale.US); - - if (osname != null) { - - // Strip off drive letters! - // REVIEW Would a better check be "(File.separator == '\')"? - - if (osname.startsWith("windows")) { - if (fileName.length() > 2) { - final char ch1 = fileName.charAt(0); - final char ch2 = fileName.charAt(1); - - if (ch2 == ':' - && ((ch1 >= 'a' && ch1 <= 'z') - || (ch1 >= 'A' && ch1 <= 'Z'))) { - fileName = fileName.substring(2); - } - } - } else if (osname.indexOf("netware",0) > -1) { - final int colon = fileName.indexOf(':'); - if (colon != -1) { - fileName = fileName.substring(colon + 1); - } - } - } - - fileName = fileName.replace(File.separatorChar, '/'); - - // No absolute pathnames - // Windows (and Posix?) paths can start with "\\NetworkDrive\", - // so we loop on starting /'s. 
- while (fileName.length() > 0 && fileName.charAt(0) == '/') { - fileName = fileName.substring(1); - } - - this.linkName = new StringBuilder(""); - this.name = new StringBuilder(fileName); - - if (file.isDirectory()) { - this.mode = DEFAULT_DIR_MODE; - this.linkFlag = LF_DIR; - - if (this.name.charAt(this.name.length() - 1) != '/') { - this.name.append("/"); - } - } else { - this.mode = DEFAULT_FILE_MODE; - this.linkFlag = LF_NORMAL; - } - - this.size = file.length(); - this.modTime = file.lastModified() / MILLIS_PER_SECOND; - this.devMajor = 0; - this.devMinor = 0; - } - - /** - * Construct an entry from an archive's header bytes. File is set - * to null. - * - * @param headerBuf The header bytes from a tar archive entry. - */ - public TarEntry(final byte[] headerBuf) { - this(); - parseTarHeader(headerBuf); - } - - /** - * Determine if the two entries are equal. Equality is determined - * by the header names being equal. - * - * @param it Entry to be checked for equality. - * @return True if the entries are equal. - */ - @Override - public boolean equals(final Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (!(obj instanceof TarEntry)) return false; - final TarEntry other = (TarEntry) obj; - return getName().equals(other.getName()); - } - - /** - * Hashcodes are based on entry names. - * - * @return the entry hashcode - */ - @Override - public int hashCode() { - return getName().hashCode(); - } - - /** - * Determine if the given entry is a descendant of this entry. - * Descendancy is determined by the name of the descendant - * starting with this entry's name. - * - * @param desc Entry to be checked as a descendent of this. - * @return True if entry is a descendant of this. - */ - public boolean isDescendent(final TarEntry desc) { - return desc.getName().startsWith(getName()); - } - - /** - * Get this entry's name. - * - * @return This entry's name. 
- */ - public String getName() { - return this.name.toString(); - } - - /** - * Set this entry's name. - * - * @param name This entry's new name. - */ - public void setName(final String name) { - this.name = new StringBuilder(name); - } - - /** - * Set the mode for this entry - * - * @param mode the mode for this entry - */ - public void setMode(final int mode) { - this.mode = mode; - } - - /** - * Get this entry's link name. - * - * @return This entry's link name. - */ - public String getLinkName() { - return this.linkName.toString(); - } - - /** - * Get this entry's user id. - * - * @return This entry's user id. - */ - public int getUserId() { - return this.userId; - } - - /** - * Set this entry's user id. - * - * @param userId This entry's new user id. - */ - public void setUserId(final int userId) { - this.userId = userId; - } - - /** - * Get this entry's group id. - * - * @return This entry's group id. - */ - public int getGroupId() { - return this.groupId; - } - - /** - * Set this entry's group id. - * - * @param groupId This entry's new group id. - */ - public void setGroupId(final int groupId) { - this.groupId = groupId; - } - - /** - * Get this entry's user name. - * - * @return This entry's user name. - */ - public String getUserName() { - return this.userName.toString(); - } - - /** - * Set this entry's user name. - * - * @param userName This entry's new user name. - */ - public void setUserName(final String userName) { - this.userName = new StringBuilder(userName); - } - - /** - * Get this entry's group name. - * - * @return This entry's group name. - */ - public String getGroupName() { - return this.groupName.toString(); - } - - /** - * Set this entry's group name. - * - * @param groupName This entry's new group name. - */ - public void setGroupName(final String groupName) { - this.groupName = new StringBuilder(groupName); - } - - /** - * Convenience method to set this entry's group and user ids. - * - * @param userId This entry's new user id. 
- * @param groupId This entry's new group id. - */ - public void setIds(final int userId, final int groupId) { - setUserId(userId); - setGroupId(groupId); - } - - /** - * Convenience method to set this entry's group and user names. - * - * @param userName This entry's new user name. - * @param groupName This entry's new group name. - */ - public void setNames(final String userName, final String groupName) { - setUserName(userName); - setGroupName(groupName); - } - - /** - * Set this entry's modification time. The parameter passed - * to this method is in "Java time". - * - * @param time This entry's new modification time. - */ - public void setModTime(final long time) { - this.modTime = time / MILLIS_PER_SECOND; - } - - /** - * Set this entry's modification time. - * - * @param time This entry's new modification time. - */ - public void setModTime(final Date time) { - this.modTime = time.getTime() / MILLIS_PER_SECOND; - } - - /** - * Set this entry's modification time. - * - * @return time This entry's new modification time. - */ - public Date getModTime() { - return new Date(this.modTime * MILLIS_PER_SECOND); - } - - /** - * Get this entry's file. - * - * @return This entry's file. - */ - public File getFile() { - return this.file; - } - - /** - * Get this entry's mode. - * - * @return This entry's mode. - */ - public int getMode() { - return this.mode; - } - - /** - * Get this entry's file size. - * - * @return This entry's file size. - */ - public long getSize() { - return this.size; - } - - /** - * Set this entry's file size. - * - * @param size This entry's new file size. 
- */ - public void setSize(final long size) { - this.size = size; - } - - - /** - * Indicate if this entry is a GNU long name block - * - * @return true if this is a long name extension provided by GNU tar - */ - public boolean isGNULongNameEntry() { - return this.linkFlag == LF_GNUTYPE_LONGNAME - && this.name.toString().equals(GNU_LONGLINK); - } - - /** - * Return whether or not this entry represents a directory. - * - * @return True if this entry is a directory. - */ - public boolean isDirectory() { - if (this.file != null) { - return this.file.isDirectory(); - } - - if (this.linkFlag == LF_DIR) { - return true; - } - - if (getName().endsWith("/")) { - return true; - } - - return false; - } - - /** - * If this entry represents a file, and the file is a directory, return - * an array of TarEntries for this entry's children. - * - * @return An array of TarEntry's for this entry's children. - */ - public TarEntry[] getDirectoryEntries() { - if (this.file == null || !this.file.isDirectory()) { - return new TarEntry[0]; - } - - final String[] list = this.file.list(); - final TarEntry[] result = new TarEntry[list.length]; - - for (int i = 0; i < list.length; ++i) { - result[i] = new TarEntry(new File(this.file, list[i])); - } - - return result; - } - - /** - * Write an entry's header information to a header buffer. - * - * @param outbuf The tar entry header buffer to fill in. 
- */ - public void writeEntryHeader(final byte[] outbuf) { - int offset = 0; - - offset = TarUtils.getNameBytes(this.name, outbuf, offset, NAMELEN); - offset = TarUtils.getOctalBytes(this.mode, outbuf, offset, MODELEN); - offset = TarUtils.getOctalBytes(this.userId, outbuf, offset, UIDLEN); - offset = TarUtils.getOctalBytes(this.groupId, outbuf, offset, GIDLEN); - offset = TarUtils.getLongOctalBytes(this.size, outbuf, offset, SIZELEN); - offset = TarUtils.getLongOctalBytes(this.modTime, outbuf, offset, MODTIMELEN); - - final int csOffset = offset; - - for (int c = 0; c < CHKSUMLEN; ++c) { - outbuf[offset++] = (byte) ' '; - } - - outbuf[offset++] = this.linkFlag; - offset = TarUtils.getNameBytes(this.linkName, outbuf, offset, NAMELEN); - offset = TarUtils.getNameBytes(this.magic, outbuf, offset, MAGICLEN); - offset = TarUtils.getNameBytes(this.userName, outbuf, offset, UNAMELEN); - offset = TarUtils.getNameBytes(this.groupName, outbuf, offset, GNAMELEN); - offset = TarUtils.getOctalBytes(this.devMajor, outbuf, offset, DEVLEN); - offset = TarUtils.getOctalBytes(this.devMinor, outbuf, offset, DEVLEN); - - while (offset < outbuf.length) { - outbuf[offset++] = 0; - } - - final long chk = TarUtils.computeCheckSum(outbuf); - - TarUtils.getCheckSumOctalBytes(chk, outbuf, csOffset, CHKSUMLEN); - } - - /** - * Parse an entry's header information from a header buffer. - * - * @param header The tar entry header buffer to get information from. 
- */ - public void parseTarHeader(final byte[] header) { - int offset = 0; - - this.name = TarUtils.parseName(header, offset, NAMELEN); - offset += NAMELEN; - this.mode = (int) TarUtils.parseOctal(header, offset, MODELEN); - offset += MODELEN; - this.userId = (int) TarUtils.parseOctal(header, offset, UIDLEN); - offset += UIDLEN; - this.groupId = (int) TarUtils.parseOctal(header, offset, GIDLEN); - offset += GIDLEN; - this.size = TarUtils.parseOctal(header, offset, SIZELEN); - offset += SIZELEN; - this.modTime = TarUtils.parseOctal(header, offset, MODTIMELEN); - offset += MODTIMELEN; - offset += CHKSUMLEN; - this.linkFlag = header[offset++]; - this.linkName = TarUtils.parseName(header, offset, NAMELEN); - offset += NAMELEN; - this.magic = TarUtils.parseName(header, offset, MAGICLEN); - offset += MAGICLEN; - this.userName = TarUtils.parseName(header, offset, UNAMELEN); - offset += UNAMELEN; - this.groupName = TarUtils.parseName(header, offset, GNAMELEN); - offset += GNAMELEN; - this.devMajor = (int) TarUtils.parseOctal(header, offset, DEVLEN); - offset += DEVLEN; - this.devMinor = (int) TarUtils.parseOctal(header, offset, DEVLEN); - } -} diff --git a/source/org/apache/tools/tar/TarInputStream.java b/source/org/apache/tools/tar/TarInputStream.java deleted file mode 100644 index 30365bbbe..000000000 --- a/source/org/apache/tools/tar/TarInputStream.java +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ -/* - * Modifications (Michael Christen) - * - replaced StringBuffer with StringBuilder - */ - -package org.apache.tools.tar; - -import java.io.FilterInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -/** - * The TarInputStream reads a UNIX tar archive as an InputStream. - * methods are provided to position at each successive entry in - * the archive, and the read each entry as a normal input stream - * using read(). - * - */ -public class TarInputStream extends FilterInputStream { - private static final int SMALL_BUFFER_SIZE = 256; - private static final int BUFFER_SIZE = 8 * 1024; - private static final int LARGE_BUFFER_SIZE = 32 * 1024; - private static final int BYTE_MASK = 0xFF; - - // CheckStyle:VisibilityModifier OFF - bc - protected boolean debug; - protected boolean hasHitEOF; - protected long entrySize; - protected long entryOffset; - protected byte[] readBuf; - protected TarBuffer buffer; - protected TarEntry currEntry; - - /** - * This contents of this array is not used at all in this class, - * it is only here to avoid repreated object creation during calls - * to the no-arg read method. - */ - protected byte[] oneBuf; - - // CheckStyle:VisibilityModifier ON - - /** - * Constructor for TarInputStream. 
- * @param is the input stream to use - */ - public TarInputStream(InputStream is) { - this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for TarInputStream. - * @param is the input stream to use - * @param blockSize the block size to use - */ - public TarInputStream(InputStream is, int blockSize) { - this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for TarInputStream. - * @param is the input stream to use - * @param blockSize the block size to use - * @param recordSize the record size to use - */ - public TarInputStream(InputStream is, int blockSize, int recordSize) { - super(is); - - this.buffer = new TarBuffer(is, blockSize, recordSize); - this.readBuf = null; - this.oneBuf = new byte[1]; - this.debug = false; - this.hasHitEOF = false; - } - - /** - * Sets the debugging flag. - * - * @param debug True to turn on debugging. - */ - public void setDebug(boolean debug) { - this.debug = debug; - this.buffer.setDebug(debug); - } - - /** - * Closes this stream. Calls the TarBuffer's close() method. - * @throws IOException on error - */ - @Override - public synchronized void close() throws IOException { - this.buffer.close(); - } - - /** - * Get the record size being used by this stream's TarBuffer. - * - * @return The TarBuffer record size. - */ - public int getRecordSize() { - return this.buffer.getRecordSize(); - } - - /** - * Get the available data that can be read from the current - * entry in the archive. This does not indicate how much data - * is left in the entire archive, only in the current entry. - * This value is determined from the entry's size header field - * and the amount of data already read from the current entry. - * Integer.MAX_VALUE is returen in case more than Integer.MAX_VALUE - * bytes are left in the current entry in the archive. - * - * @return The number of available bytes for the current entry. 
- * @throws IOException for signature - */ - @Override - public int available() throws IOException { - if (this.entrySize - this.entryOffset > Integer.MAX_VALUE) { - return Integer.MAX_VALUE; - } - return (int) (this.entrySize - this.entryOffset); - } - - /** - * Skip bytes in the input buffer. This skips bytes in the - * current entry's data, not the entire archive, and will - * stop at the end of the current entry's data if the number - * to skip extends beyond that point. - * - * @param numToSkip The number of bytes to skip. - * @return the number actually skipped - * @throws IOException on error - */ - @Override - public long skip(long numToSkip) throws IOException { - // REVIEW - // This is horribly inefficient, but it ensures that we - // properly skip over bytes via the TarBuffer... - // - byte[] skipBuf = new byte[BUFFER_SIZE]; - long skip = numToSkip; - while (skip > 0) { - int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip); - int numRead = this.read(skipBuf, 0, realSkip); - if (numRead == -1) { - break; - } - skip -= numRead; - } - return (numToSkip - skip); - } - - /** - * Since we do not support marking just yet, we return false. - * - * @return False. - */ - @Override - public boolean markSupported() { - return false; - } - - /** - * Since we do not support marking just yet, we do nothing. - * - * @param markLimit The limit to mark. - */ - @Override - public synchronized void mark(int markLimit) { - } - - /** - * Since we do not support marking just yet, we do nothing. - */ - @Override - public synchronized void reset() { - } - - /** - * Get the next entry in this tar archive. This will skip - * over any remaining data in the current entry, if there - * is one, and place the input stream at the header of the - * next entry, and read the header and instantiate a new - * TarEntry from the header bytes and return that entry. 
- * If there are no more entries in the archive, null will - * be returned to indicate that the end of the archive has - * been reached. - * - * @return The next TarEntry in the archive, or null. - * @throws IOException on error - */ - public TarEntry getNextEntry() throws IOException { - if (this.hasHitEOF) { - return null; - } - - if (this.currEntry != null) { - long numToSkip = this.entrySize - this.entryOffset; - - if (this.debug) { - System.err.println("TarInputStream: SKIP currENTRY '" - + this.currEntry.getName() + "' SZ " - + this.entrySize + " OFF " - + this.entryOffset + " skipping " - + numToSkip + " bytes"); - } - - if (numToSkip > 0) { - this.skip(numToSkip); - } - - this.readBuf = null; - } - - byte[] headerBuf = this.buffer.readRecord(); - - if (headerBuf == null) { - if (this.debug) { - System.err.println("READ NULL RECORD"); - } - this.hasHitEOF = true; - } else if (this.buffer.isEOFRecord(headerBuf)) { - if (this.debug) { - System.err.println("READ EOF RECORD"); - } - this.hasHitEOF = true; - } - - if (this.hasHitEOF) { - this.currEntry = null; - } else { - this.currEntry = new TarEntry(headerBuf); - - if (this.debug) { - System.err.println("TarInputStream: SET CURRENTRY '" - + this.currEntry.getName() - + "' size = " - + this.currEntry.getSize()); - } - - this.entryOffset = 0; - - this.entrySize = this.currEntry.getSize(); - } - - if (this.currEntry != null && this.currEntry.isGNULongNameEntry()) { - // read in the name - StringBuilder longName = new StringBuilder(); - byte[] buf = new byte[SMALL_BUFFER_SIZE]; - int length = 0; - while ((length = read(buf)) >= 0) { - longName.append(new String(buf, 0, length)); - } - getNextEntry(); - if (this.currEntry == null) { - // Bugzilla: 40334 - // Malformed tar file - long entry name not followed by entry - return null; - } - // remove trailing null terminator - if (longName.length() > 0 - && longName.charAt(longName.length() - 1) == 0) { - longName.deleteCharAt(longName.length() - 1); - } - 
this.currEntry.setName(longName.toString()); - } - - return this.currEntry; - } - - /** - * Reads a byte from the current tar archive entry. - * - * This method simply calls read( byte[], int, int ). - * - * @return The byte read, or -1 at EOF. - * @throws IOException on error - */ - @Override - public int read() throws IOException { - int num = this.read(this.oneBuf, 0, 1); - return num == -1 ? -1 : (this.oneBuf[0]) & BYTE_MASK; - } - - /** - * Reads bytes from the current tar archive entry. - * - * This method is aware of the boundaries of the current - * entry in the archive and will deal with them as if they - * were this stream's start and EOF. - * - * @param buf The buffer into which to place bytes read. - * @param offset The offset at which to place bytes read. - * @param numToRead The number of bytes to read. - * @return The number of bytes read, or -1 at EOF. - * @throws IOException on error - */ - @Override - public int read(byte[] buf, int offset, int numToRead) throws IOException { - int totalRead = 0; - - if (this.entryOffset >= this.entrySize) { - return -1; - } - - if ((numToRead + this.entryOffset) > this.entrySize) { - numToRead = (int) (this.entrySize - this.entryOffset); - } - - if (this.readBuf != null) { - int sz = (numToRead > this.readBuf.length) ? this.readBuf.length - : numToRead; - - System.arraycopy(this.readBuf, 0, buf, offset, sz); - - if (sz >= this.readBuf.length) { - this.readBuf = null; - } else { - int newLen = this.readBuf.length - sz; - byte[] newBuf = new byte[newLen]; - - System.arraycopy(this.readBuf, sz, newBuf, 0, newLen); - - this.readBuf = newBuf; - } - - totalRead += sz; - numToRead -= sz; - offset += sz; - } - - while (numToRead > 0) { - byte[] rec = this.buffer.readRecord(); - - if (rec == null) { - // Unexpected EOF! 
- throw new IOException("unexpected EOF with " + numToRead - + " bytes unread"); - } - - int sz = numToRead; - int recLen = rec.length; - - if (recLen > sz) { - System.arraycopy(rec, 0, buf, offset, sz); - - this.readBuf = new byte[recLen - sz]; - - System.arraycopy(rec, sz, this.readBuf, 0, recLen - sz); - } else { - sz = recLen; - - System.arraycopy(rec, 0, buf, offset, recLen); - } - - totalRead += sz; - numToRead -= sz; - offset += sz; - } - - this.entryOffset += totalRead; - - return totalRead; - } - - /** - * Copies the contents of the current tar archive entry directly into - * an output stream. - * - * @param out The OutputStream into which to write the entry's data. - * @throws IOException on error - */ - public void copyEntryContents(OutputStream out) throws IOException { - byte[] buf = new byte[LARGE_BUFFER_SIZE]; - - while (true) { - int numRead = this.read(buf, 0, buf.length); - - if (numRead == -1) { - break; - } - - out.write(buf, 0, numRead); - } - } -} diff --git a/source/org/apache/tools/tar/TarOutputStream.java b/source/org/apache/tools/tar/TarOutputStream.java deleted file mode 100644 index c7afda383..000000000 --- a/source/org/apache/tools/tar/TarOutputStream.java +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ - -package org.apache.tools.tar; - -import java.io.FilterOutputStream; -import java.io.OutputStream; -import java.io.IOException; - -/** - * The TarOutputStream writes a UNIX tar archive as an OutputStream. - * Methods are provided to put entries, and then write their contents - * by writing to this stream using write(). - * - */ -public class TarOutputStream extends FilterOutputStream { - /** Fail if a long file name is required in the archive. */ - public static final int LONGFILE_ERROR = 0; - - /** Long paths will be truncated in the archive. */ - public static final int LONGFILE_TRUNCATE = 1; - - /** GNU tar extensions are used to store long file names in the archive. */ - public static final int LONGFILE_GNU = 2; - - // CheckStyle:VisibilityModifier OFF - bc - protected boolean debug; - protected long currSize; - protected String currName; - protected long currBytes; - protected byte[] oneBuf; - protected byte[] recordBuf; - protected int assemLen; - protected byte[] assemBuf; - protected TarBuffer buffer; - protected int longFileMode = LONGFILE_ERROR; - // CheckStyle:VisibilityModifier ON - - private boolean closed = false; - - /** - * Constructor for TarInputStream. - * @param os the output stream to use - */ - public TarOutputStream(OutputStream os) { - this(os, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for TarInputStream. - * @param os the output stream to use - * @param blockSize the block size to use - */ - public TarOutputStream(OutputStream os, int blockSize) { - this(os, blockSize, TarBuffer.DEFAULT_RCDSIZE); - } - - /** - * Constructor for TarInputStream. 
- * @param os the output stream to use - * @param blockSize the block size to use - * @param recordSize the record size to use - */ - public TarOutputStream(OutputStream os, int blockSize, int recordSize) { - super(os); - - this.buffer = new TarBuffer(os, blockSize, recordSize); - this.debug = false; - this.assemLen = 0; - this.assemBuf = new byte[recordSize]; - this.recordBuf = new byte[recordSize]; - this.oneBuf = new byte[1]; - } - - /** - * Set the long file mode. - * This can be LONGFILE_ERROR(0), LONGFILE_TRUNCATE(1) or LONGFILE_GNU(2). - * This specifies the treatment of long file names (names >= TarConstants.NAMELEN). - * Default is LONGFILE_ERROR. - * @param longFileMode the mode to use - */ - public void setLongFileMode(int longFileMode) { - this.longFileMode = longFileMode; - } - - - /** - * Sets the debugging flag. - * - * @param debugF True to turn on debugging. - */ - public void setDebug(boolean debugF) { - this.debug = debugF; - } - - /** - * Sets the debugging flag in this stream's TarBuffer. - * - * @param debug True to turn on debugging. - */ - public void setBufferDebug(boolean debug) { - this.buffer.setDebug(debug); - } - - /** - * Ends the TAR archive without closing the underlying OutputStream. - * The result is that the two EOF records of nulls are written. - * @throws IOException on error - */ - public void finish() throws IOException { - // See Bugzilla 28776 for a discussion on this - // http://issues.apache.org/bugzilla/show_bug.cgi?id=28776 - this.writeEOFRecord(); - this.writeEOFRecord(); - } - - /** - * Ends the TAR archive and closes the underlying OutputStream. - * This means that finish() is called followed by calling the - * TarBuffer's close(). - * @throws IOException on error - */ - @Override - public synchronized void close() throws IOException { - if (!closed) { - this.finish(); - this.buffer.close(); - out.close(); - closed = true; - } - } - - /** - * Get the record size being used by this stream's TarBuffer. 
- * - * @return The TarBuffer record size. - */ - public int getRecordSize() { - return this.buffer.getRecordSize(); - } - - /** - * Put an entry on the output stream. This writes the entry's - * header record and positions the output stream for writing - * the contents of the entry. Once this method is called, the - * stream is ready for calls to write() to write the entry's - * contents. Once the contents are written, closeEntry() - * MUST be called to ensure that all buffered data - * is completely written to the output stream. - * - * @param entry The TarEntry to be written to the archive. - * @throws IOException on error - */ - public void putNextEntry(TarEntry entry) throws IOException { - if (entry.getName().length() >= TarConstants.NAMELEN) { - - if (longFileMode == LONGFILE_GNU) { - // create a TarEntry for the LongLink, the contents - // of which are the entry's name - TarEntry longLinkEntry = new TarEntry(TarConstants.GNU_LONGLINK, - TarConstants.LF_GNUTYPE_LONGNAME); - - longLinkEntry.setSize(entry.getName().length() + 1); - putNextEntry(longLinkEntry); - write(entry.getName().getBytes()); - write(0); - closeEntry(); - } else if (longFileMode != LONGFILE_TRUNCATE) { - throw new RuntimeException("file name '" + entry.getName() - + "' is too long ( > " - + TarConstants.NAMELEN + " bytes)"); - } - } - - entry.writeEntryHeader(this.recordBuf); - this.buffer.writeRecord(this.recordBuf); - - this.currBytes = 0; - - if (entry.isDirectory()) { - this.currSize = 0; - } else { - this.currSize = entry.getSize(); - } - currName = entry.getName(); - } - - /** - * Close an entry. This method MUST be called for all file - * entries that contain data. The reason is that we must - * buffer data written to the stream in order to satisfy - * the buffer's record based writes. Thus, there may be - * data fragments still being assembled that must be written - * to the output stream before this entry is closed and the - * next entry written. 
- * @throws IOException on error - */ - public void closeEntry() throws IOException { - if (this.assemLen > 0) { - for (int i = this.assemLen; i < this.assemBuf.length; ++i) { - this.assemBuf[i] = 0; - } - - this.buffer.writeRecord(this.assemBuf); - - this.currBytes += this.assemLen; - this.assemLen = 0; - } - - if (this.currBytes < this.currSize) { - throw new IOException("entry '" + currName + "' closed at '" - + this.currBytes - + "' before the '" + this.currSize - + "' bytes specified in the header were written"); - } - } - - /** - * Writes a byte to the current tar archive entry. - * - * This method simply calls read( byte[], int, int ). - * - * @param b The byte written. - * @throws IOException on error - */ - @Override - public void write(int b) throws IOException { - this.oneBuf[0] = (byte) b; - - this.write(this.oneBuf, 0, 1); - } - - /** - * Writes bytes to the current tar archive entry. - * - * This method simply calls write( byte[], int, int ). - * - * @param wBuf The buffer to write to the archive. - * @throws IOException on error - */ - @Override - public void write(byte[] wBuf) throws IOException { - this.write(wBuf, 0, wBuf.length); - } - - /** - * Writes bytes to the current tar archive entry. This method - * is aware of the current entry and will throw an exception if - * you attempt to write bytes past the length specified for the - * current entry. The method is also (painfully) aware of the - * record buffering required by TarBuffer, and manages buffers - * that are not a multiple of recordsize in length, including - * assembling records from small buffers. - * - * @param wBuf The buffer to write to the archive. - * @param wOffset The offset in the buffer from which to get bytes. - * @param numToWrite The number of bytes to write. 
- * @throws IOException on error - */ - @Override - public void write(byte[] wBuf, int wOffset, int numToWrite) throws IOException { - if ((this.currBytes + numToWrite) > this.currSize) { - throw new IOException("request to write '" + numToWrite - + "' bytes exceeds size in header of '" - + this.currSize + "' bytes for entry '" - + currName + "'"); - - // - // We have to deal with assembly!!! - // The programmer can be writing little 32 byte chunks for all - // we know, and we must assemble complete records for writing. - // REVIEW Maybe this should be in TarBuffer? Could that help to - // eliminate some of the buffer copying. - // - } - - if (this.assemLen > 0) { - if ((this.assemLen + numToWrite) >= this.recordBuf.length) { - int aLen = this.recordBuf.length - this.assemLen; - - System.arraycopy(this.assemBuf, 0, this.recordBuf, 0, - this.assemLen); - System.arraycopy(wBuf, wOffset, this.recordBuf, - this.assemLen, aLen); - this.buffer.writeRecord(this.recordBuf); - - this.currBytes += this.recordBuf.length; - wOffset += aLen; - numToWrite -= aLen; - this.assemLen = 0; - } else { - System.arraycopy(wBuf, wOffset, this.assemBuf, this.assemLen, - numToWrite); - - wOffset += numToWrite; - this.assemLen += numToWrite; - numToWrite -= numToWrite; - } - } - - // - // When we get here we have EITHER: - // o An empty "assemble" buffer. - // o No bytes to write (numToWrite == 0) - // - while (numToWrite > 0) { - if (numToWrite < this.recordBuf.length) { - System.arraycopy(wBuf, wOffset, this.assemBuf, this.assemLen, - numToWrite); - - this.assemLen += numToWrite; - - break; - } - - this.buffer.writeRecord(wBuf, wOffset); - - int num = this.recordBuf.length; - - this.currBytes += num; - numToWrite -= num; - wOffset += num; - } - } - - /** - * Write an EOF (end of archive) record to the tar archive. - * An EOF record consists of a record of all zeros. 
- */ - private void writeEOFRecord() throws IOException { - for (int i = 0; i < this.recordBuf.length; ++i) { - this.recordBuf[i] = 0; - } - - this.buffer.writeRecord(this.recordBuf); - } -} - - diff --git a/source/org/apache/tools/tar/TarUtils.java b/source/org/apache/tools/tar/TarUtils.java deleted file mode 100644 index 68636e6a8..000000000 --- a/source/org/apache/tools/tar/TarUtils.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -/* - * This package is based on the work done by Timothy Gerard Endres - * (time@ice.com) to whom the Ant project is very grateful for his great code. - */ -/* - * Modifications (Michael Christen) - * - replaced StringBuffer with StringBuilder - */ - -package org.apache.tools.tar; - -/** - * This class provides static utility methods to work with byte streams. - * - */ -// CheckStyle:HideUtilityClassConstructorCheck OFF (bc) -public class TarUtils { - - private static final int BYTE_MASK = 255; - - /** - * Parse an octal string from a header buffer. This is used for the - * file permission mode value. - * - * @param header The header buffer from which to parse. - * @param offset The offset into the buffer from which to parse. 
- * @param length The number of header bytes to parse. - * @return The long value of the octal string. - */ - public static long parseOctal(byte[] header, int offset, int length) { - long result = 0; - boolean stillPadding = true; - int end = offset + length; - - for (int i = offset; i < end; ++i) { - if (header[i] == 0) { - break; - } - - if (header[i] == (byte) ' ' || header[i] == '0') { - if (stillPadding) { - continue; - } - - if (header[i] == (byte) ' ') { - break; - } - } - - stillPadding = false; - // CheckStyle:MagicNumber OFF - result = (result << 3) + (header[i] - '0'); - // CheckStyle:MagicNumber ON - } - - return result; - } - - /** - * Parse an entry name from a header buffer. - * - * @param header The header buffer from which to parse. - * @param offset The offset into the buffer from which to parse. - * @param length The number of header bytes to parse. - * @return The header's entry name. - */ - public static StringBuilder parseName(byte[] header, int offset, int length) { - StringBuilder result = new StringBuilder(length); - int end = offset + length; - - for (int i = offset; i < end; ++i) { - if (header[i] == 0) { - break; - } - - result.append((char) header[i]); - } - - return result; - } - - /** - * Determine the number of bytes in an entry name. - * - * @param name The header name from which to parse. - * @param buf The buffer from which to parse. - * @param offset The offset into the buffer from which to parse. - * @param length The number of header bytes to parse. - * @return The number of bytes in a header's entry name. - */ - public static int getNameBytes(StringBuilder name, byte[] buf, int offset, int length) { - int i; - - for (i = 0; i < length && i < name.length(); ++i) { - buf[offset + i] = (byte) name.charAt(i); - } - - for (; i < length; ++i) { - buf[offset + i] = 0; - } - - return offset + length; - } - - /** - * Parse an octal integer from a header buffer. 
- * - * @param value The header value - * @param buf The buffer from which to parse. - * @param offset The offset into the buffer from which to parse. - * @param length The number of header bytes to parse. - * @return The integer value of the octal bytes. - */ - public static int getOctalBytes(long value, byte[] buf, int offset, int length) { - int idx = length - 1; - - buf[offset + idx] = 0; - --idx; - buf[offset + idx] = (byte) ' '; - --idx; - - if (value == 0) { - buf[offset + idx] = (byte) '0'; - --idx; - } else { - for (long val = value; idx >= 0 && val > 0; --idx) { - // CheckStyle:MagicNumber OFF - buf[offset + idx] = (byte) ((byte) '0' + (byte) (val & 7)); - val = val >> 3; - // CheckStyle:MagicNumber ON - } - } - - for (; idx >= 0; --idx) { - buf[offset + idx] = (byte) ' '; - } - - return offset + length; - } - - /** - * Parse an octal long integer from a header buffer. - * - * @param value The header value - * @param buf The buffer from which to parse. - * @param offset The offset into the buffer from which to parse. - * @param length The number of header bytes to parse. - * @return The long value of the octal bytes. - */ - public static int getLongOctalBytes(long value, byte[] buf, int offset, int length) { - byte[] temp = new byte[length + 1]; - - getOctalBytes(value, temp, 0, length + 1); - System.arraycopy(temp, 0, buf, offset, length); - - return offset + length; - } - - /** - * Parse the checksum octal integer from a header buffer. - * - * @param value The header value - * @param buf The buffer from which to parse. - * @param offset The offset into the buffer from which to parse. - * @param length The number of header bytes to parse. - * @return The integer value of the entry's checksum. 
- */ - public static int getCheckSumOctalBytes(long value, byte[] buf, int offset, int length) { - getOctalBytes(value, buf, offset, length); - - buf[offset + length - 1] = (byte) ' '; - buf[offset + length - 2] = 0; - - return offset + length; - } - - /** - * Compute the checksum of a tar entry header. - * - * @param buf The tar entry's header buffer. - * @return The computed checksum. - */ - public static long computeCheckSum(byte[] buf) { - long sum = 0; - - for (int i = 0; i < buf.length; ++i) { - sum += BYTE_MASK & buf[i]; - } - - return sum; - } -} From 8532565c7d109ed5eed718602347fa9400ccbcc4 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 4 Nov 2015 21:52:02 +0100 Subject: [PATCH 8/8] optimize order of parsers to try - start with a parser matching the remote supplied mime --- source/net/yacy/document/TextParser.java | 33 ++++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index a5d980794..66f679f69 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -375,9 +375,10 @@ public final class TextParser { * because mime types returned by web severs are sometimes wrong, we also compute the mime type again * from the extension that can be extracted from the url path. That means that there are 3 criteria * that can be used to select a parser: - * - the given extension - * - the given mime type - * - the mime type computed from the extension + * - the given mime type (1.) + * - the extension of url (2.) + * - the mime type computed from the extension (3.) 
+ * finally the generic parser is added as backup if all above fail * @param url the given url * @param mimeType the given mime type * @return a list of Idiom parsers that may be appropriate for the given criteria @@ -386,26 +387,30 @@ public final class TextParser { private static Set parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure { final Set idioms = new LinkedHashSet(2); // LinkedSet to maintain order (genericParser should be last) - // check extension - String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime Set idiom; - if (ext != null && ext.length() > 0) { - if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); - idiom = ext2parser.get(ext); - if (idiom != null) idioms.addAll(idiom); - } - - // check given mime type if (mimeType1 != null) { mimeType1 = normalizeMimeType(mimeType1); if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url); idiom = mime2parser.get(mimeType1); - if (idiom != null && !idioms.contains(idiom)) idioms.addAll(idiom); + if (idiom != null) idioms.addAll(idiom); + } + + // check extension and add as backup (in case no, wrong or unknown/unsupported mime was suppied) + String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + if (ext != null && ext.length() > 0) { + if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); + idiom = ext2parser.get(ext); + if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser + idioms.addAll(idiom); + } } // check mime type computed from extension final String mimeType2 = ext2mime.get(ext); - if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom); + if 
(mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser + idioms.addAll(idiom); + } // always add the generic parser (make sure it is the last in access order) idioms.add(genericIdiom);