From 11639aef351d5785248aa4625a8f3d62be04b166 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 25 May 2010 12:54:57 +0000 Subject: [PATCH] - added new protocol loader for 'file'-type URLs - it is now possible to crawl the local file system with an intranet peer - redesign of URL handling - refactoring: created LGPLed package cora: 'content retrieval api' which may be used externally by other applications without yacy core elements because it has no dependencies to other parts of yacy git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6902 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 5 +- htroot/Collage.java | 6 +- htroot/Crawler_p.java | 12 +- htroot/FeedReader_p.java | 8 +- htroot/SettingsAck_p.html | 10 +- htroot/SettingsAck_p.java | 18 +- htroot/Settings_Crawler.inc | 16 + htroot/Settings_p.java | 1 + htroot/ViewFile.java | 9 +- htroot/api/feed.java | 4 +- htroot/rct_p.java | 6 +- htroot/yacy/search.java | 4 +- htroot/yacy/transferRWI.java | 4 +- htroot/yacy/transferURL.java | 4 +- htroot/yacysearch.java | 4 +- htroot/yacysearch_location.java | 7 +- source/de/anomic/crawler/CrawlQueues.java | 6 +- source/de/anomic/crawler/CrawlStacker.java | 1 + source/de/anomic/crawler/ResultImages.java | 15 +- source/de/anomic/crawler/RobotsTxt.java | 5 +- .../anomic/crawler/retrieval/FTPLoader.java | 5 +- .../anomic/crawler/retrieval/FileLoader.java | 144 +++ .../anomic/crawler/retrieval/HTTPLoader.java | 5 +- source/de/anomic/data/BookmarkHelper.java | 17 +- source/de/anomic/data/MimeTable.java | 6 +- source/de/anomic/net/ftpc.java | 4 +- source/de/anomic/search/DocumentIndex.java | 2 +- source/de/anomic/search/MediaSnippet.java | 15 +- .../de/anomic/search/MetadataRepository.java | 3 +- source/de/anomic/search/RankingProcess.java | 3 +- source/de/anomic/search/ResultEntry.java | 3 +- source/de/anomic/search/ResultFetcher.java | 6 +- source/de/anomic/search/Segment.java | 7 +- source/de/anomic/search/Switchboard.java | 15 +- 
source/de/anomic/search/TextSnippet.java | 2 +- source/de/anomic/server/serverObjects.java | 6 +- .../yacy/graphics/WebStructureGraph.java | 9 +- source/de/anomic/yacy/yacyClient.java | 245 +--- source/de/anomic/yacy/yacyCore.java | 4 +- source/de/anomic/yacy/yacyPeerActions.java | 4 +- source/de/anomic/yacy/yacyRelease.java | 14 +- source/net/yacy/cora/document/Channel.java | 42 + source/net/yacy/cora/document/Channels.java | 27 + source/net/yacy/cora/document/Hit.java | 74 ++ .../yacy/cora/document/MultiProtocolURI.java | 1037 ++++++++++++++++ .../util => cora/document}/Punycode.java | 18 +- .../parser/xml => cora/document}/RSSFeed.java | 55 +- .../content => cora/document}/RSSMessage.java | 119 +- .../xml => cora/document}/RSSReader.java | 77 +- .../net/yacy/cora/protocol/HttpConnector.java | 90 ++ source/net/yacy/cora/services/Search.java | 145 +++ source/net/yacy/document/AbstractParser.java | 12 +- source/net/yacy/document/Condenser.java | 6 +- source/net/yacy/document/Document.java | 90 +- source/net/yacy/document/Idiom.java | 8 +- source/net/yacy/document/ParserException.java | 8 +- source/net/yacy/document/TextParser.java | 20 +- .../net/yacy/document/parser/bzipParser.java | 4 +- .../net/yacy/document/parser/csvParser.java | 6 +- .../net/yacy/document/parser/docParser.java | 4 +- .../net/yacy/document/parser/gzipParser.java | 4 +- .../document/parser/html/ContentScraper.java | 62 +- .../yacy/document/parser/html/ImageEntry.java | 8 +- .../parser/html/ScraperInputStream.java | 4 +- .../net/yacy/document/parser/htmlParser.java | 6 +- .../parser/images/genericImageParser.java | 22 +- .../net/yacy/document/parser/odtParser.java | 6 +- .../net/yacy/document/parser/ooxmlParser.java | 6 +- .../net/yacy/document/parser/pdfParser.java | 4 +- .../net/yacy/document/parser/pptParser.java | 4 +- source/net/yacy/document/parser/psParser.java | 6 +- .../net/yacy/document/parser/rssParser.java | 28 +- .../net/yacy/document/parser/rtfParser.java | 4 +- 
.../yacy/document/parser/sevenzipParser.java | 12 +- .../net/yacy/document/parser/swfParser.java | 8 +- .../net/yacy/document/parser/tarParser.java | 10 +- .../yacy/document/parser/torrentParser.java | 6 +- .../net/yacy/document/parser/vcfParser.java | 8 +- .../net/yacy/document/parser/vsdParser.java | 4 +- .../net/yacy/document/parser/xlsParser.java | 6 +- .../net/yacy/document/parser/zipParser.java | 10 +- .../yacy/kelondro/data/meta/DigestURI.java | 1058 +---------------- source/net/yacy/kelondro/util/Domains.java | 2 +- source/net/yacy/repository/Blacklist.java | 2 +- .../net/yacy/repository/LoaderDispatcher.java | 8 +- test/de/anomic/yacy/yacyURLTest.java | 6 +- 86 files changed, 2134 insertions(+), 1676 deletions(-) create mode 100644 source/de/anomic/crawler/retrieval/FileLoader.java create mode 100644 source/net/yacy/cora/document/Channel.java create mode 100644 source/net/yacy/cora/document/Channels.java create mode 100644 source/net/yacy/cora/document/Hit.java create mode 100644 source/net/yacy/cora/document/MultiProtocolURI.java rename source/net/yacy/{kelondro/util => cora/document}/Punycode.java (94%) rename source/net/yacy/{document/parser/xml => cora/document}/RSSFeed.java (77%) rename source/net/yacy/{document/content => cora/document}/RSSMessage.java (62%) rename source/net/yacy/{document/parser/xml => cora/document}/RSSReader.java (70%) create mode 100644 source/net/yacy/cora/protocol/HttpConnector.java create mode 100644 source/net/yacy/cora/services/Search.java diff --git a/defaults/yacy.init b/defaults/yacy.init index f115ed269..38a2778d6 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -685,7 +685,10 @@ crawler.http.maxFileSize=1048576 crawler.ftp.maxFileSize=1048576 # smb crawler specific settings: maximum size -crawler.smb.maxFileSize=50000000 +crawler.smb.maxFileSize=100000000 + +# smb crawler specific settings: maximum size +crawler.file.maxFileSize=100000000 # maximum number of crawler threads crawler.MaxActiveThreads = 200 
diff --git a/htroot/Collage.java b/htroot/Collage.java index ce5e0c5c5..55352b624 100755 --- a/htroot/Collage.java +++ b/htroot/Collage.java @@ -24,7 +24,7 @@ import java.util.Random; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.util.Domains; import de.anomic.crawler.ResultImages; @@ -90,8 +90,8 @@ public class Collage { final int yOffset = embed ? 0 : 70; for (int i = 0; i < fifoSize; i++) { - final DigestURI baseURL = origins[i].baseURL; - final DigestURI imageURL = origins[i].imageEntry.url(); + final MultiProtocolURI baseURL = origins[i].baseURL; + final MultiProtocolURI imageURL = origins[i].imageEntry.url(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index ad48a4df5..4829d0061 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -36,6 +36,7 @@ import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; @@ -234,7 +235,7 @@ public class Crawler_p { // stack url sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( - crawlingStartURL.getHost(), + (crawlingStartURL.getHost() == null) ? 
Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), crawlingStartURL, newcrawlingMustMatch, newcrawlingMustNotMatch, @@ -345,7 +346,7 @@ public class Crawler_p { writer.close(); //String headline = scraper.getHeadline(); - final Map hyperlinks = scraper.getAnchors(); + final Map hyperlinks = scraper.getAnchors(); // creating a crawler profile final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); @@ -370,11 +371,12 @@ public class Crawler_p { sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); // loop through the contained links - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); DigestURI nexturl; while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); - nexturl = e.getKey(); + final Map.Entry e = linkiterator.next(); + if (e.getKey() == null) continue; + nexturl = new DigestURI(e.getKey()); if (nexturl == null) continue; // enqueuing the url for crawling diff --git a/htroot/FeedReader_p.java b/htroot/FeedReader_p.java index cba09d0cd..d2059e187 100644 --- a/htroot/FeedReader_p.java +++ b/htroot/FeedReader_p.java @@ -25,9 +25,9 @@ import java.io.IOException; import java.net.MalformedURLException; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; -import net.yacy.document.parser.xml.RSSReader; +import net.yacy.cora.document.Hit; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSReader; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -69,7 +69,7 @@ public class FeedReader_p { prop.putHTML("page_description", feed.getChannel().getDescription()); int i = 0; - for (final RSSMessage item: feed) { + for (final Hit item: feed) { prop.putHTML("page_items_" + i + "_author", item.getAuthor()); prop.putHTML("page_items_" + i + "_title", item.getTitle()); prop.putHTML("page_items_" + i + "_link", item.getLink()); diff 
--git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html index 4be5bcb58..79d8f0be6 100644 --- a/htroot/SettingsAck_p.html +++ b/htroot/SettingsAck_p.html @@ -159,23 +159,27 @@ http Crawler Settings: - Maximum Filesize: + Maximum HTTP Filesize: #[crawler.http.maxFileSize]# ftp Crawler Settings: - Maximum Filesize: + Maximum FTP Filesize: #[crawler.ftp.maxFileSize]# smb Crawler Settings: - Maximum Filesize: + Maximum SMB Filesize: #[crawler.smb.maxFileSize]# + + Maximum file Filesize: + #[crawler.file.maxFileSize]# + ::

Invalid crawler timeout value: #[crawler.clientTimeout]#

diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 1ec681c4e..0c3fe9214 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -503,18 +503,32 @@ public class SettingsAck_p { long maxSmbSize; try { maxSmbSize = Integer.parseInt(maxSizeStr); - env.setConfig("crawler.smb.maxFileSize", Long.toString(maxFtpSize)); + env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize)); } catch (final NumberFormatException e) { prop.put("info", "31"); prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize")); return prop; } + maxSizeStr = post.get("crawler.file.maxFileSize"); + if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1"; + + long maxFileSize; + try { + maxFileSize = Integer.parseInt(maxSizeStr); + env.setConfig("crawler.file.maxFileSize", Long.toString(maxFileSize)); + } catch (final NumberFormatException e) { + prop.put("info", "31"); + prop.putHTML("info_crawler.file.maxFileSize",post.get("crawler.file.maxFileSize")); + return prop; + } + // everything is ok prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout)); prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize)); prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize)); - prop.put("info_crawler.smb.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize)); + prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize)); + prop.put("info_crawler.file.maxFileSize", (maxFileSize==-1) ? "-1":Formatter.bytesToString(maxFileSize)); prop.put("info", "28"); return prop; } diff --git a/htroot/Settings_Crawler.inc b/htroot/Settings_Crawler.inc index f375ebeb3..112fa9f0a 100644 --- a/htroot/Settings_Crawler.inc +++ b/htroot/Settings_Crawler.inc @@ -26,6 +26,22 @@
+

SMB Crawler Settings:

+ + Maximum Filesize: + + Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. -1 means unlimited. + +
+ +

Local File Crawler Settings:

+ + Maximum Filesize: + + Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. -1 means unlimited. + +
+   diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 1c2998d95..382f78718 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -202,6 +202,7 @@ public final class Settings_p { prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1")); prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1")); prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1")); + prop.putHTML("crawler.file.maxFileSize",sb.getConfig("crawler.file.maxFileSize", "-1")); // return rewrite properties return prop; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index cb1108d9a..cb3bd045e 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -35,6 +35,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.ParserException; @@ -372,7 +373,7 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); dark = (i % 2 == 0); - final HashMap ts = document.getImages(); + final HashMap ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -439,9 +440,9 @@ public class ViewFile { return message; } - private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map media, final String name, boolean dark) { - final Iterator> mi = media.entrySet().iterator(); - Map.Entry entry; + private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map media, final String name, boolean dark) { + final Iterator> mi = media.entrySet().iterator(); + Map.Entry entry; int i = 0; while (mi.hasNext()) { entry = mi.next(); diff --git a/htroot/api/feed.java b/htroot/api/feed.java index 227ea6592..000301f9b 100755 --- a/htroot/api/feed.java +++ 
b/htroot/api/feed.java @@ -2,8 +2,8 @@ import java.util.Date; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; diff --git a/htroot/rct_p.java b/htroot/rct_p.java index a5549c117..716685881 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -30,8 +30,8 @@ import java.text.ParseException; import java.util.Date; import java.util.Iterator; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.Hit; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.DateFormatter; @@ -57,7 +57,7 @@ public class rct_p { final yacySeed seed = (peerhash == null) ? null : sb.peers.getConnected(peerhash); final RSSFeed feed = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000); if (feed != null) { - for (final RSSMessage item: feed) { + for (final Hit item: feed) { //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); // put url on remote crawl stack diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 428e71237..bcf296f7b 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -35,8 +35,8 @@ import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 341c876d6..93d6fcb1a 100644 --- a/htroot/yacy/transferRWI.java +++ 
b/htroot/yacy/transferRWI.java @@ -30,8 +30,8 @@ import java.util.ArrayList; import java.util.Iterator; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.HandleSet; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index fd5748975..8c8cab6a4 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -29,8 +29,8 @@ import java.io.IOException; import java.text.ParseException; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index b4b54b578..a92b38e67 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -32,11 +32,11 @@ import java.util.HashMap; import java.util.Iterator; import java.util.TreeSet; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.document.Condenser; import net.yacy.document.Document; -import net.yacy.document.content.RSSMessage; import net.yacy.document.geolocalization.Location; -import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index ee69ceb9e..1cab05061 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -22,7 +22,8 @@ import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; -import 
net.yacy.document.content.RSSMessage; +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.services.Search; import net.yacy.document.geolocalization.Location; import de.anomic.data.LibraryProvider; import de.anomic.http.server.HeaderFramework; @@ -32,7 +33,6 @@ import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.yacy.yacyClient; import java.util.Date; import net.yacy.kelondro.util.DateFormatter; @@ -91,7 +91,8 @@ public class yacysearch_location { if (search_title || search_publisher || search_creator || search_subject) try { // get a queue of search results - BlockingQueue results = yacyClient.search(null, query, false, false, maximumTime, Integer.MAX_VALUE); + String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss"; + BlockingQueue results = Search.search(rssSearchServiceURL, query, false, false, maximumTime, Integer.MAX_VALUE); // take the results and compute some locations RSSMessage message; diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 386b921d9..aa1abf100 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -36,8 +36,8 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.Hit; +import net.yacy.cora.document.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -421,7 +421,7 @@ public class CrawlQueues { // parse the rss DigestURI url, referrer; Date loaddate; - for (final RSSMessage item: feed) { + for (final Hit item: feed) { //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" 
+ item.getPubDate()); // put url on remote crawl stack diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index c1b792d11..82b5f0a56 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -354,6 +354,7 @@ public final class CrawlStacker { // returns true if the url can be accepted accoring to network.unit.domain if (url == null) return "url is null"; final String host = url.getHost(); + if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null; if (host == null) return "url.host is null"; if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve // check if this is a local address and we are allowed to index local pages: diff --git a/source/de/anomic/crawler/ResultImages.java b/source/de/anomic/crawler/ResultImages.java index d60c7de46..2626384c0 100755 --- a/source/de/anomic/crawler/ResultImages.java +++ b/source/de/anomic/crawler/ResultImages.java @@ -30,9 +30,9 @@ import java.util.HashMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Document; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; public class ResultImages { @@ -48,18 +48,17 @@ public class ResultImages { // we also check all links for a double-check so we don't get the same image more than once in any queue // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence: // the same images may be linked from different pages - private static final ConcurrentHashMap doubleCheck = new ConcurrentHashMap(); // (url-hash, time) when the url appeared first + private static final ConcurrentHashMap doubleCheck = new ConcurrentHashMap(); // (url, time) when the url appeared first public static 
void registerImages(final Document document, final boolean privateEntry) { if (document == null) return; if (document.dc_source() == null) return; - final HashMap images = document.getImages(); + final HashMap images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup - String hashstring = new String(image.url().hash()); - if (doubleCheck.containsKey(hashstring)) continue; - doubleCheck.put(hashstring, System.currentTimeMillis()); + if (doubleCheck.containsKey(image.url())) continue; + doubleCheck.put(image.url(), System.currentTimeMillis()); final String name = image.url().getFile(); boolean good = false; @@ -144,8 +143,8 @@ public class ResultImages { public static class OriginEntry { public ImageEntry imageEntry; - public DigestURI baseURL; - public OriginEntry(final ImageEntry imageEntry, final DigestURI baseURL) { + public MultiProtocolURI baseURL; + public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURI baseURL) { this.imageEntry = imageEntry; this.baseURL = baseURL; } diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index e1ff4c146..c6569a9f8 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -35,6 +35,7 @@ import java.util.Date; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.blob.BEncodedHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -317,7 +318,7 @@ public class RobotsTxt { reqHeaders.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); // adding referer - reqHeaders.put(RequestHeader.REFERER, (DigestURI.newURL(robotsURL,"/")).toNormalform(true, true)); + reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true, true)); if (entry != null) { 
oldEtag = entry.getETag(); @@ -380,7 +381,7 @@ public class RobotsTxt { redirectionUrlString = redirectionUrlString.trim(); // generating the new URL object - final DigestURI redirectionUrl = DigestURI.newURL(robotsURL, redirectionUrlString); + final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(robotsURL, redirectionUrlString)); // following the redirection if (log.isFinest()) log.logFinest("Redirection detected for robots.txt with URL '" + robotsURL + "'." + diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 94c18cd04..4c9967b9d 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Date; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -272,8 +273,8 @@ public class FTPLoader { * @param entryUrl * @return */ - private String getPath(final DigestURI entryUrl) { - return DigestURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); + private String getPath(final MultiProtocolURI entryUrl) { + return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); } } diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java new file mode 100644 index 000000000..bf6951727 --- /dev/null +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -0,0 +1,144 @@ +/** + * FileLoader + * Copyright 2010 by Michael Peter Christen + * First released 25.5.2010 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ + +package de.anomic.crawler.retrieval; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import de.anomic.http.server.HeaderFramework; +import de.anomic.http.server.RequestHeader; +import de.anomic.http.server.ResponseHeader; +import de.anomic.net.ftpc; +import de.anomic.search.Segments; +import de.anomic.search.Switchboard; +import de.anomic.data.MimeTable; + +import net.yacy.document.TextParser; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.DateFormatter; +import net.yacy.kelondro.util.FileUtils; + +public class FileLoader { + + private final Switchboard sb; + private final Log log; + private final int maxFileSize; + + public FileLoader(final Switchboard sb, final Log log) { + this.sb = sb; + this.log = log; + maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l); + } + + public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { + DigestURI url = request.url(); + if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol()); + + RequestHeader requestHeader = new RequestHeader(); + if (request.referrerhash() != null) { + DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); + if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false)); + } + + // process directories: transform them to html with meta robots=noindex (using the ftpc lib) + 
if (url.isDirectory()) { + String[] l = url.list(); + if (l == null) { + // this can only happen if there is no connection or the directory does not exist + log.logInfo("directory listing not available. URL = " + request.url().toString()); + sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString()); + throw new IOException("directory listing not available. URL = " + request.url().toString()); + } + String u = url.toNormalform(true, true); + List list = new ArrayList(); + for (String s: l) { + list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s); + } + + StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true); + + ResponseHeader responseHeader = new ResponseHeader(); + responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date())); + responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); + Response response = new Response( + request, + requestHeader, + responseHeader, + "200", + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + content.toString().getBytes()); + + return response; + } + + // create response header + String mime = MimeTable.ext2mime(url.getFileExtension()); + ResponseHeader responseHeader = new ResponseHeader(); + responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified()))); + responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); + + // check mime type and availability of parsers + // and also check resource size and limitation of the size + long size = url.length(); + String parserError = null; + if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || + (size > maxFileSize && maxFileSize >= 0)) { + // we know that we cannot process that file before loading + // only the metadata is returned + + if (parserError != null) { + log.logInfo("No parser available in File crawler: '" + 
parserError + "' for URL " + request.url().toString() + ": parsing only metadata"); + } else { + log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata"); + } + + // create response with metadata only + responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); + Response response = new Response( + request, + requestHeader, + responseHeader, + "200", + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + url.toNormalform(true, true).getBytes()); + return response; + } + + // load the resource + InputStream is = url.getInputStream(); + byte[] b = FileUtils.read(is); + is.close(); + + // create response with loaded content + Response response = new Response( + request, + requestHeader, + responseHeader, + "200", + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + b); + return response; + } +} diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index e0a8e23a0..bbb3f41c6 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -27,6 +27,7 @@ package de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Date; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -180,7 +181,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString); + final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // restart crawling with new url this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString()); @@ -289,7 +290,7 @@ public final class HTTPLoader { } // normalizing URL - final DigestURI redirectionUrl = 
DigestURI.newURL(request.url(), redirectionUrlString); + final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // if we are already doing a shutdown we don't need to retry crawling diff --git a/source/de/anomic/data/BookmarkHelper.java b/source/de/anomic/data/BookmarkHelper.java index 2be4cf829..908f8668b 100644 --- a/source/de/anomic/data/BookmarkHelper.java +++ b/source/de/anomic/data/BookmarkHelper.java @@ -52,6 +52,7 @@ import org.xml.sax.SAXException; import de.anomic.data.bookmarksDB.Bookmark; import de.anomic.data.bookmarksDB.Tag; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; @@ -128,9 +129,9 @@ public class BookmarkHelper { int importCount = 0; - Map links = new HashMap(); + Map links = new HashMap(); String title; - DigestURI url; + MultiProtocolURI url; Bookmark bm; final Set tags=listManager.string2set(tag); //this allow multiple default tags try { @@ -142,14 +143,14 @@ public class BookmarkHelper { writer.close(); links = scraper.getAnchors(); } catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());} - for (Entry link: links.entrySet()) { - url= link.getKey(); - title=link.getValue(); + for (Entry link: links.entrySet()) { + url = link.getKey(); + title = link.getValue(); Log.logInfo("BOOKMARKS", "links.get(url)"); - if(title.equals("")){//cannot be displayed - title=url.toString(); + if (title.equals("")) {//cannot be displayed + title = url.toString(); } - bm=db.new Bookmark(url.toString()); + bm = db.new Bookmark(url.toString()); bm.setProperty(Bookmark.BOOKMARK_TITLE, title); bm.setTags(tags); bm.setPublic(importPublic); diff --git a/source/de/anomic/data/MimeTable.java b/source/de/anomic/data/MimeTable.java index 4a0438863..3986e6e78 100644 --- 
a/source/de/anomic/data/MimeTable.java +++ b/source/de/anomic/data/MimeTable.java @@ -5,7 +5,7 @@ import java.io.File; import java.io.FileInputStream; import java.util.Properties; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; public class MimeTable { @@ -42,11 +42,11 @@ public class MimeTable { return mimeTable.getProperty(ext, dfltMime); } - public static String url2mime(final DigestURI url, final String dfltMime) { + public static String url2mime(final MultiProtocolURI url, final String dfltMime) { return ext2mime(url.getFileExtension(), dfltMime); } - public static String url2mime(final DigestURI url) { + public static String url2mime(final MultiProtocolURI url) { return ext2mime(url.getFileExtension()); } } diff --git a/source/de/anomic/net/ftpc.java b/source/de/anomic/net/ftpc.java index bf7b637aa..7b1330692 100644 --- a/source/de/anomic/net/ftpc.java +++ b/source/de/anomic/net/ftpc.java @@ -2645,7 +2645,7 @@ public class ftpc { page.append("\n"); page.append("\n"); page.append(" " + title + "\n"); - page.append(" \n"); + page.append(" \n"); if (metaRobotNoindex) { page.append(" \n"); } @@ -2674,7 +2674,7 @@ public class ftpc { if (line.length() > nameEnd) { page.append(line.substring(nameEnd)); } - } else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://")) { + } else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://") || line.startsWith("file://")) { page.append("" + line + ""); } else { // raw diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 9a6712bc3..b0a3e5b0a 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -146,7 +146,7 @@ public class DocumentIndex extends Segment { * If the given file is a path to a directory, the complete sub-tree is indexed * @param start */ - public void addConcurrent(DigestURI start) { + public 
void addConcurrent(DigestURI start) throws IOException { assert (start != null); assert (start.canRead()) : start.toString(); if (!start.isDirectory()) { diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index bdd719916..ada9138cc 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -32,6 +32,7 @@ import java.util.TreeSet; import de.anomic.data.MimeTable; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Document; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; @@ -130,25 +131,25 @@ public class MediaSnippet implements Comparable, Comparator computeMediaSnippets(final Document document, final HandleSet queryhashes, final ContentDomain mediatype) { if (document == null) return new ArrayList(); - Map media = null; + Map media = null; if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks(); else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks(); else if (mediatype == ContentDomain.APP) media = document.getApplinks(); if (media == null) return null; - final Iterator> i = media.entrySet().iterator(); - Map.Entry entry; + final Iterator> i = media.entrySet().iterator(); + Map.Entry entry; DigestURI url; String desc; final ArrayList result = new ArrayList(); while (i.hasNext()) { entry = i.next(); - url = entry.getKey(); + url = new DigestURI(entry.getKey()); desc = entry.getValue(); int ranking = TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() + TextSnippet.removeAppearanceHashes(desc, queryhashes).size(); if (ranking < 2 * queryhashes.size()) { - result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, document.dc_source())); + result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, new 
DigestURI(document.dc_source()))); } } return result; @@ -167,7 +168,7 @@ public class MediaSnippet implements Comparable, Comparator result = new ArrayList(); while (i.hasNext()) { ientry = i.next(); - url = ientry.url(); + url = new DigestURI(ientry.url()); String u = url.toString(); if (u.indexOf(".ico") >= 0 || u.indexOf("favicon") >= 0) continue; if (ientry.height() > 0 && ientry.height() < 64) continue; @@ -177,7 +178,7 @@ public class MediaSnippet implements Comparable, Comparator { if (format == 2) { pw.println(""); pw.println("" + CharacterCoding.unicode2xml(metadata.dc_title(), true) + ""); - pw.println("" + DigestURI.escape(url) + ""); + pw.println("" + MultiProtocolURI.escape(url) + ""); if (metadata.dc_creator().length() > 0) pw.println("" + CharacterCoding.unicode2xml(metadata.dc_creator(), true) + ""); if (metadata.dc_subject().length() > 0) pw.println("" + CharacterCoding.unicode2xml(metadata.dc_subject(), true) + ""); pw.println("" + entry.moddate().toString() + ""); diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 6cd2621ae..61a06395c 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -39,6 +39,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -631,7 +632,7 @@ public final class RankingProcess extends Thread { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url - final String[] descrcomps = DigestURI.splitpattern.split(resultEntry.title().toLowerCase()); 
// words in the description + final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description // add references //addTopic(urlcomps); diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index 7d1590c2c..96983aa33 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.Date; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -124,7 +125,7 @@ public class ResultEntry implements Comparable, Comparator> i = condenser.words().entrySet().iterator(); @@ -273,10 +274,10 @@ public class Segment { if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) { // no confirmation using the url, use the TLD language = url.language(); - System.out.println(error + ", corrected using the TLD"); + log.logWarning(error + ", corrected using the TLD"); } else { // this is a strong hint that the statistics was in fact correct - System.out.println(error + ", but the url proves that the statistic is correct"); + log.logWarning(error + ", but the url proves that the statistic is correct"); } } } else { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index ca1be28a0..ba6670001 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -70,16 +70,17 @@ import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.document.Condenser; import net.yacy.document.Document; import 
net.yacy.document.TextParser; import net.yacy.document.ParserException; import net.yacy.document.content.DCEntry; -import net.yacy.document.content.RSSMessage; import net.yacy.document.content.SurrogateReader; import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.document.parser.xml.RSSFeed; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow.Components; @@ -291,7 +292,7 @@ public final class Switchboard extends serverSwitch { // init sessionid name file final String sessionidNamesFile = getConfig("sessionidNamesFile",""); this.log.logConfig("Loading sessionid file " + sessionidNamesFile); - DigestURI.initSessionIDNames(new File(getRootPath(), sessionidNamesFile)); + MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File(getRootPath(), sessionidNamesFile))); // init tables this.tables = new WorkTables(this.workPath); @@ -1733,7 +1734,7 @@ public final class Switchboard extends serverSwitch { ((response.profile() == null) || (response.depth() < response.profile().depth())) ) { // get the hyperlinks - final Map hl = document.getHyperlinks(); + final Map hl = document.getHyperlinks(); // add all images also to the crawl stack for (ImageEntry imageReference : document.getImages().values()) { @@ -1741,15 +1742,15 @@ public final class Switchboard extends serverSwitch { } // insert those hyperlinks to the crawler - DigestURI nextUrl; - for (Map.Entry nextEntry : hl.entrySet()) { + MultiProtocolURI nextUrl; + for (Map.Entry nextEntry : hl.entrySet()) { // check for interruption checkInterruption(); // process the next hyperlink nextUrl = nextEntry.getKey(); String u = nextUrl.toNormalform(true, true, true); - if (!(u.startsWith("http") || u.startsWith("ftp") || u.startsWith("smb"))) continue; + if (!(u.startsWith("http://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) 
continue; // enqueue the hyperlink into the pre-notice-url db try { crawlStacker.enqueueEntry(new Request( diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 1f3ab032e..264bfbe23 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -405,7 +405,7 @@ public class TextSnippet implements Comparable, Comparator implements Cloneable if (this.size() == 0) return ""; StringBuilder param = new StringBuilder(); for (Map.Entry entry: this.entrySet()) { - param.append(DigestURI.escape(entry.getKey())); + param.append(MultiProtocolURI.escape(entry.getKey())); param.append('='); - param.append(DigestURI.escape(entry.getValue())); + param.append(MultiProtocolURI.escape(entry.getValue())); param.append('&'); } param.setLength(param.length() - 1); diff --git a/source/de/anomic/yacy/graphics/WebStructureGraph.java b/source/de/anomic/yacy/graphics/WebStructureGraph.java index 2a6b8208e..546e0f482 100644 --- a/source/de/anomic/yacy/graphics/WebStructureGraph.java +++ b/source/de/anomic/yacy/graphics/WebStructureGraph.java @@ -37,6 +37,7 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; @@ -95,11 +96,11 @@ public class WebStructureGraph { } public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final Document document, final Condenser condenser, final Date docDate) { - final DigestURI url = document.dc_source(); + final DigestURI url = new DigestURI(document.dc_source()); // generate citation reference - final Map hl = document.getHyperlinks(); - final Iterator it = hl.keySet().iterator(); + final Map hl = document.getHyperlinks(); + final Iterator it = hl.keySet().iterator(); byte[] nexturlhashb; String nexturlhash; final StringBuilder cpg = new StringBuilder(12 * 
(hl.size() + 1) + 1); @@ -109,7 +110,7 @@ public class WebStructureGraph { int GCount = 0; int LCount = 0; while (it.hasNext()) { - nexturlhashb = it.next().hash(); + nexturlhashb = new DigestURI(it.next()).hash(); if (nexturlhashb != null) { nexturlhash = new String(nexturlhashb); assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index e4df355c8..4bf0bec1e 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -54,14 +54,12 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.LinkedBlockingQueue; import java.util.regex.Pattern; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; -import net.yacy.document.parser.xml.RSSReader; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSReader; +import net.yacy.cora.protocol.HttpConnector; +import net.yacy.cora.services.Search; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -86,10 +84,8 @@ import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.http.client.DefaultCharsetFilePart; import de.anomic.http.client.DefaultCharsetStringPart; import de.anomic.http.client.Client; -import de.anomic.http.client.RemoteProxyConfig; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; -import de.anomic.http.server.ResponseContainer; import de.anomic.search.RankingProfile; import de.anomic.search.RankingProcess; import de.anomic.search.Segment; @@ -101,6 +97,22 @@ import de.anomic.tools.crypt; public final class yacyClient { + + /** + * @see wput + * @param target + * 
@param filename + * @param post + * @return + * @throws IOException + */ + private static byte[] postToFile(final yacySeed target, final String filename, final List post, final int timeout) throws IOException { + return HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false); + } + private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List post, final int timeout) throws IOException { + return HttpConnector.wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false); + } + /** * this is called to enrich the seed information by * - own address (if peer is behind a nat/router) @@ -134,7 +146,7 @@ public final class yacyClient { post.add(new DefaultCharsetStringPart("seed", mySeed.genSeedStr(salt))); // send request final long start = System.currentTimeMillis(); - final byte[] content = wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false); yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? 
"null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); result = FileUtils.table(content); break; @@ -237,82 +249,6 @@ public final class yacyClient { return count; } - /** - * send data to the server named by vhost - * - * @param address address of the server - * @param vhost name of the server at address which should respond - * @param post data to send (name-value-pairs) - * @param gzipBody send with content gzip encoded - * @return response body - * @throws IOException - */ - /* - private static byte[] wput(final String url, String vhost, final List post, boolean gzipBody) throws IOException { - return wput(url, vhost, post, 10000, gzipBody); - } - */ - /** - * send data to the server named by vhost - * - * @param address address of the server - * @param vhost name of the server at address which should respond - * @param post data to send (name-value-pairs) - * @param timeout in milliseconds - * @return response body - * @throws IOException - */ - private static byte[] wput(final String url, final String vhost, final List post, final int timeout) throws IOException { - return wput(url, vhost, post, timeout, false); - } - /** - * send data to the server named by vhost - * - * @param address address of the server - * @param vhost name of the server at address which should respond - * @param post data to send (name-value-pairs) - * @param timeout in milliseconds - * @param gzipBody send with content gzip encoded - * @return response body - * @throws IOException - */ - private static byte[] wput(final String url, final String vhost, final List post, final int timeout, final boolean gzipBody) throws IOException { - final RequestHeader header = new RequestHeader(); - header.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); - header.put(HeaderFramework.HOST, vhost); - final Client client = new Client(timeout, header); - client.setProxy(proxyConfig()); - - ResponseContainer res = null; - byte[] content = null; - 
try { - // send request/data - res = client.POST(url, post, gzipBody); - content = res.getData(); - } finally { - if(res != null) { - // release connection - res.closeStream(); - } - } - return content; - } - - /** - * @see wput - * @param target - * @param filename - * @param post - * @return - * @throws IOException - */ - private static byte[] postToFile(final yacySeed target, final String filename, final List post, final int timeout) throws IOException { - return wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false); - } - private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List post, final int timeout) throws IOException { - return wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false); - } - public static yacySeed querySeed(final yacySeed target, final String seedHash) { // prepare request final String salt = crypt.randomSalt(); @@ -400,7 +336,7 @@ public final class yacyClient { // send request try { /* a long time-out is needed */ - final byte[] result = wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime); + final byte[] result = HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime); final RSSReader reader = RSSReader.parse(result); if (reader == null) { yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null"); @@ -425,120 +361,11 @@ public final class yacyClient { return null; } } - - - public static BlockingQueue search(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords) { - if (urlBase == null) { - urlBase = "http://localhost:" + 
Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss"; - } - BlockingQueue queue = new LinkedBlockingQueue(); - searchJob job = new searchJob(urlBase, query, verify, global, timeout, maximumRecords, queue); - job.start(); - return queue; - } - private final static int recordsPerSession = 10; - - public static class searchJob extends Thread { - - String urlBase, query; - boolean verify, global; - long timeout; - int startRecord, maximumRecords; - BlockingQueue queue; - - public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue queue) { - this.urlBase = urlBase; - this.query = query; - this.verify = verify; - this.global = global; - this.timeout = timeout; - this.startRecord = 0; - this.maximumRecords = maximumRecords; - this.queue = queue; - } - - public void run() { - RSSMessage message; - mainloop: while (timeout > 0 && maximumRecords > 0) { - long st = System.currentTimeMillis(); - RSSFeed feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession); - if (feed == null || feed.isEmpty()) break mainloop; - maximumRecords -= feed.size(); - innerloop: while (!feed.isEmpty()) { - message = feed.pollMessage(); - if (message == null) break innerloop; - try { - queue.put(message); - } catch (InterruptedException e) { - break innerloop; - } - } - startRecord += recordsPerSession; - timeout -= System.currentTimeMillis() - st; - } - try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {} - } - } - - /** - * send a query to a yacy public search interface - * @param urlBase the target url base (everything before the ? that follows the SRU request syntax properties). 
can null, then the local peer is used - * @param query the query as string - * @param startRecord number of first record - * @param maximumRecords maximum number of records - * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned - * @param global if true also search results from other peers are included - * @param timeout milliseconds that are waited at maximum for a search result - * @return - */ - public static RSSFeed search(String urlBase, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) { - // returns a search result from a peer - if (urlBase == null) { - urlBase = "http://localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss"; - } - DigestURI uri = null; - try { - uri = new DigestURI(urlBase, null); - } catch (MalformedURLException e) { - yacyCore.log.logWarning("yacyClient.search failed asking peer '" + urlBase + "': bad url, " + e.getMessage()); - return null; - } - - // prepare request - final List post = new ArrayList(); - post.add(new DefaultCharsetStringPart("query", query)); - post.add(new DefaultCharsetStringPart("startRecord", Integer.toString(startRecord))); - post.add(new DefaultCharsetStringPart("maximumRecords", Long.toString(maximumRecords))); - post.add(new DefaultCharsetStringPart("verify", verify ? "true" : "false")); - post.add(new DefaultCharsetStringPart("resource", global ? 
"global" : "local")); - - // send request - try { - final byte[] result = wput(urlBase, uri.getHost(), post, (int) timeout); - //String debug = new String(result); System.out.println("*** DEBUG: " + debug); - final RSSReader reader = RSSReader.parse(result); - if (reader == null) { - yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null"); - return null; - } - final RSSFeed feed = reader.getFeed(); - if (feed == null) { - // case where the rss reader does not understand the content - yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)"); - return null; - } - return feed; - } catch (final IOException e) { - yacyCore.log.logSevere("yacyClient.search error asking peer '" + uri.getHost() + "':" + e.toString()); - return null; - } - } - - public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) { + public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException { String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? 
"localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress(); String urlBase = "http://" + address + "/yacysearch.rss"; - return search(urlBase, query, verify, global, timeout, startRecord, maximumRecords); + return Search.search(urlBase, query, verify, global, timeout, startRecord, maximumRecords); } @SuppressWarnings("unchecked") @@ -607,7 +434,7 @@ public final class yacyClient { // send request HashMap result = null; try { - result = FileUtils.table(wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000)); + result = FileUtils.table(HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000)); } catch (final IOException e) { yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore); //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage()); @@ -878,7 +705,7 @@ public final class yacyClient { // send request try { - final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000); + final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000); final HashMap result = FileUtils.table(content); return result; } catch (final Exception e) { @@ -902,7 +729,7 @@ public final class yacyClient { // send request try { - final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000); + final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000); final HashMap result = FileUtils.table(content); return result; } catch (final Exception e) { @@ -977,7 +804,7 @@ public final class yacyClient { // send request try { - final byte[] content = wput("http://" + address + 
"/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000); return FileUtils.table(content); } catch (final Exception e) { // most probably a network time-out exception @@ -1127,7 +954,7 @@ public final class yacyClient { post.add(new DefaultCharsetStringPart("entryc", Integer.toString(indexcount))); post.add(new DefaultCharsetStringPart("indexes", entrypost.toString())); try { - final byte[] content = wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody); final Iterator v = FileUtils.strings(content); // this should return a list of urlhashes that are unknown @@ -1171,7 +998,7 @@ public final class yacyClient { } post.add(new DefaultCharsetStringPart("urlc", Integer.toString(urlc))); try { - final byte[] content = wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody); final Iterator v = FileUtils.strings(content); final HashMap result = FileUtils.table(v); @@ -1193,7 +1020,7 @@ public final class yacyClient { String address = targetSeed.getClusterAddress(); if (address == null) { address = "localhost:8080"; } try { - final byte[] content = wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000); return FileUtils.table(content); } catch (final Exception e) { yacyCore.log.logSevere("yacyClient.getProfile error:" + 
e.getMessage()); @@ -1201,14 +1028,6 @@ public final class yacyClient { } } - /** - * proxy for "to YaCy connections" - * @return - */ - private static final RemoteProxyConfig proxyConfig() { - final RemoteProxyConfig p = RemoteProxyConfig.getRemoteProxyConfig(); - return ((p != null) && (p.useProxy()) && (p.useProxy4Yacy())) ? p : null; - } public static void main(final String[] args) { if(args.length > 1) { @@ -1262,7 +1081,7 @@ public final class yacyClient { //post.add(new FilePart("filename", new ByteArrayPartSource(filename, file))); // do it! try { - final byte[] response = wput(url.toString(), vhost, post, timeout, gzipBody); + final byte[] response = HttpConnector.wput(url.toString(), vhost, post, timeout, gzipBody); System.out.println(new String(response)); } catch (final IOException e) { Log.logException(e); diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index e0f18cf73..968877986 100644 --- a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -48,8 +48,8 @@ import java.util.List; import java.util.Map; import java.util.concurrent.Semaphore; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java index ae7866b38..2cb4e0294 100644 --- a/source/de/anomic/yacy/yacyPeerActions.java +++ b/source/de/anomic/yacy/yacyPeerActions.java @@ -26,8 +26,8 @@ package de.anomic.yacy; import java.util.HashMap; -import net.yacy.document.content.RSSMessage; -import net.yacy.document.parser.xml.RSSFeed; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; import net.yacy.kelondro.logging.Log; import 
net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.MapTools; diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 082e89cc6..542116c34 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -45,8 +45,8 @@ import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.html.ContentScraper; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -74,17 +74,17 @@ public final class yacyRelease extends yacyVersion { private static Map latestReleases = new HashMap(); public final static List latestReleaseLocations = new ArrayList(); // will be initialized with value in defaults/yacy.network.freeworld.unit - private DigestURI url; + private MultiProtocolURI url; private File releaseFile; private PublicKey publicKey; - public yacyRelease(final DigestURI url) { + public yacyRelease(final MultiProtocolURI url) { super(url.getFileName()); this.url = url; } - public yacyRelease(final DigestURI url, PublicKey publicKey) { + public yacyRelease(final MultiProtocolURI url, PublicKey publicKey) { this(url); this.publicKey = publicKey; } @@ -94,7 +94,7 @@ public final class yacyRelease extends yacyVersion { this.releaseFile = releaseFile; } - public DigestURI getUrl() { + public MultiProtocolURI getUrl() { return url; } @@ -241,10 +241,10 @@ public final class yacyRelease extends yacyVersion { } // analyse links in scraper resource, and find link to latest release in it - final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation + final Map anchors = scraper.getAnchors(); // a url (String) / name (String) relation final TreeSet mainReleases = new TreeSet(); final TreeSet devReleases = new TreeSet(); - for(DigestURI url : anchors.keySet()) { + 
for (MultiProtocolURI url : anchors.keySet()) { try { yacyRelease release = new yacyRelease(url, location.getPublicKey()); //System.out.println("r " + release.toAnchor()); diff --git a/source/net/yacy/cora/document/Channel.java b/source/net/yacy/cora/document/Channel.java new file mode 100644 index 000000000..1da4b2ceb --- /dev/null +++ b/source/net/yacy/cora/document/Channel.java @@ -0,0 +1,42 @@ +/** + * Channel + * Copyright 2010 by Michael Peter Christen + * First released 10.5.2010 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +package net.yacy.cora.document; + +/** + * Iterable container that declares string-valued setters for its own metadata: + * title, link, description, image URL, total results, start index, items per page and search terms. + * The interface declares no getters for these values. + * NOTE(review): the element type of Iterable is not visible here (presumably Hit) - confirm. + * NOTE(review): the setter names look like RSS / OpenSearch channel fields - confirm against the implementing class. + */ +public interface Channel extends Iterable { + + public void setTitle(String title); + + public void setLink(String link); + + public void setDescription(String description); + + public void setImageURL(String imageUrl); + + public void setTotalResults(String totalResults); + + public void setStartIndex(String startIndex); + + public void setItemsPerPage(String itemsPerPage); + + public void setSearchTerms(String searchTerms); +} diff --git a/source/net/yacy/cora/document/Channels.java b/source/net/yacy/cora/document/Channels.java new file mode 100644 index 000000000..7eeeec718 --- /dev/null +++ b/source/net/yacy/cora/document/Channels.java @@ -0,0 +1,27 @@ +/** + * Channels + * Copyright 2010 by Michael Peter Christen + * First released 10.5.2010 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +package net.yacy.cora.document; + +/** + * Empty class: it declares no fields, constructors or methods. + * NOTE(review): presumably a placeholder for a future collection of Channel objects - confirm or remove. + */ +public class Channels { + +} diff --git a/source/net/yacy/cora/document/Hit.java b/source/net/yacy/cora/document/Hit.java new file mode 100644 index 000000000..d6c7df073 --- /dev/null +++ b/source/net/yacy/cora/document/Hit.java @@ -0,0 +1,74 @@ +/** + * Hit + * Copyright 2010 by Michael Peter Christen + * First released 10.5.2010 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +package net.yacy.cora.document; + +/** + * Declares string-valued accessors for the fields of a single hit: + * author, copyright, category, title, link, referrer, language, description, pubDate, guid and docs + * each have a setter and a getter; creator has a setter only (there is no getCreator()). + * NOTE(review): several setter parameters are named after a different field, e.g. + * setAuthor(String title), setCreator(String pubdate), setDocs(String guid); this looks like + * copy/paste - consider renaming the parameters to match the method. + */ +public interface Hit { + + public void setAuthor(String title); + + public void setCopyright(String title); + + public void setCategory(String title); + + public void setTitle(String title); + + public void setLink(String link); + + public void setReferrer(String title); + + public void setLanguage(String title); + + public void setDescription(String description); + + public void setCreator(String pubdate); + + public void setPubDate(String pubdate); + + public void setGuid(String guid); + + public void setDocs(String guid); + + + public String getAuthor(); + + public String getCopyright(); + + public String getCategory(); + + public String getTitle(); + + public String getLink(); + + public String getReferrer(); + + public String getLanguage(); + + public String getDescription(); + + public String getPubDate(); + + public String getGuid(); + + public String getDocs(); + +} diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java new file mode 100644 index 000000000..f9cda9317 --- /dev/null +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -0,0 +1,1037 @@ +/** + * MultiProtocolURI + * Copyright 2010 by Michael Peter Christen + * First released 25.5.2010 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see <http://www.gnu.org/licenses/>.
+ */ + + +package net.yacy.cora.document; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.text.Collator; +import java.util.Locale; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import jcifs.smb.SmbException; +import jcifs.smb.SmbFile; +import jcifs.smb.SmbFileInputStream; + +import net.yacy.cora.document.Punycode.PunycodeException; + +/** + * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file + * + */ +public class MultiProtocolURI implements Serializable { + + private static final long serialVersionUID = -1173233022912141884L; + public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter + private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(? 
sessionIDnames; + static { + insensitiveCollator.setStrength(Collator.SECONDARY); + insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); + sessionIDnames = new TreeSet(insensitiveCollator); + } + + public static final void initSessionIDNames(Set idNames) { + for (String s: idNames) { + if (s == null) continue; + s = s.trim(); + if (s.length() > 0) sessionIDnames.add(s); + } + } + + // class variables + protected String protocol, host, userInfo, path, quest, ref; + protected int port; + + public MultiProtocolURI(final File file) throws MalformedURLException { + this("file", "", -1, file.getAbsolutePath()); + } + + protected MultiProtocolURI(final MultiProtocolURI url) { + this.protocol = url.protocol; + this.host = url.host; + this.userInfo = url.userInfo; + this.path = url.path; + this.quest = url.quest; + this.ref = url.ref; + this.port = url.port; + } + + public MultiProtocolURI(final String url) throws MalformedURLException { + if (url == null) throw new MalformedURLException("url string is null"); + parseURLString(url); + } + + public static final boolean isHTTP(String s) { return s.startsWith("http://"); } + public static final boolean isHTTPS(String s) { return s.startsWith("https://"); } + public static final boolean isFTP(String s) { return s.startsWith("ftp://"); } + public static final boolean isFile(String s) { return s.startsWith("file://"); } + public static final boolean isSMB(String s) { return s.startsWith("smb://") || s.startsWith("\\\\"); } + + public final boolean isHTTP() { return this.protocol.equals("http"); } + public final boolean isHTTPS() { return this.protocol.equals("https"); } + public final boolean isFTP() { return this.protocol.equals("ftp"); } + public final boolean isFile() { return this.protocol.equals("file"); } + public final boolean isSMB() { return this.protocol.equals("smb"); } + + private void parseURLString(String url) throws MalformedURLException { + // identify protocol + assert (url != null); + url = 
url.trim(); + if (url.startsWith("\\\\")) { + url = "smb://" + url.substring(2).replaceAll("\\\\", "/"); + } + + if (url.length() > 1 && url.charAt(1) == ':') { + // maybe a DOS drive path + url = "file://" + url; + } + + if (url.length() > 0 && url.charAt(0) == '/') { + // maybe a unix/linux absolute path + url = "file://" + url; + } + + int p = url.indexOf(':'); + if (p < 0) { + url = "http://" + url; + p = 4; + } + this.protocol = url.substring(0, p).toLowerCase().trim(); + if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'"); + if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) { + // identify host, userInfo and file for http and ftp protocol + final int q = url.indexOf('/', p + 3); + int r; + if (q < 0) { + if ((r = url.indexOf('@', p + 3)) < 0) { + host = url.substring(p + 3); + userInfo = null; + } else { + host = url.substring(r + 1); + userInfo = url.substring(p + 3, r); + } + path = "/"; + } else { + host = url.substring(p + 3, q).trim(); + if ((r = host.indexOf('@')) < 0) { + userInfo = null; + } else { + userInfo = host.substring(0, r); + host = host.substring(r + 1); + } + path = url.substring(q); + } + if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'"); + if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host"); + path = resolveBackpath(path); + identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 
445 : -1))))); + identRef(); + identQuest(); + escape(); + } else { + // this is not a http or ftp url + if (protocol.equals("mailto")) { + // parse email url: the part before '@' becomes userInfo, the part after it the host; + // a mailto url has no path, port, query or reference + final int q = url.indexOf('@', p + 3); + if (q < 0) { + throw new MalformedURLException("wrong email address: " + url); + } + userInfo = url.substring(p + 1, q); + host = url.substring(q + 1); + path = null; + port = -1; + quest = null; + ref = null; + } else if (protocol.equals("file")) { + // 'else if' is required here: with a plain 'if' a successfully parsed mailto url + // would reach the final else branch and be rejected as "unknown protocol" + // parse file url + String h = url.substring(p + 1); + if (h.startsWith("//")) { + // host may be given, but may be also empty + final int q = h.indexOf('/', 2); + if (q <= 0) { + // no host given + host = null; + path = h.substring(2); + } else { + host = h.substring(2, q); + if (host.length() == 0 || host.equals("localhost")) host = null; + h = h.substring(q); + // a drive path like "/C:..." needs at least three characters; + // shorter paths (i.e. "/" or "/x") are taken as they are instead of failing in charAt(2) + if (h.length() > 2 && (h.charAt(2) == ':' || h.charAt(2) == '|')) + path = h.substring(1); + else + path = h; + } + } else { + host = null; + if (h.length() > 0 && h.charAt(0) == '/') { + char c = h.charAt(2); + if (c == ':' || c == '|') + path = h.substring(1); + else + path = h; + } else { + char c = h.charAt(1); + if (c == ':' || c == '|') + path = h; + else + path = "/" + h; + } + } + userInfo = null; + port = -1; + quest = null; + ref = null; + } else { + throw new MalformedURLException("unknown protocol: " + url); + } + } + + // handle international domains + if (!Punycode.isBasic(host)) try { + final String[] domainParts = patternDot.split(host, 0); + StringBuilder buffer = new StringBuilder(); + // encode each domain-part separately + for(int i=0; i 0 && relPath.charAt(0) == '/') { + this.path = relPath; + } else if (baseURL.path.endsWith("/")) { + if (relPath.length() > 0 && (relPath.charAt(0) == '#' || relPath.charAt(0) == '?')) { + throw new MalformedURLException("relative path malformed: " + relPath); + } + this.path = baseURL.path + relPath; + } else { + if (relPath.length() > 0 && (relPath.charAt(0) == '#' || relPath.charAt(0) == '?')) { + this.path = baseURL.path + 
relPath; + } else { + final int q = baseURL.path.lastIndexOf('/'); + if (q < 0) { + this.path = relPath; + } else { + this.path = baseURL.path.substring(0, q + 1) + relPath; + } + } + } + this.quest = baseURL.quest; + this.ref = baseURL.ref; + + path = resolveBackpath(path); + identRef(); + identQuest(); + escape(); + } + + public MultiProtocolURI(final String protocol, final String host, final int port, final String path) throws MalformedURLException { + if (protocol == null) throw new MalformedURLException("protocol is null"); + this.protocol = protocol; + this.host = host; + this.port = port; + this.path = path; + identRef(); + identQuest(); + escape(); + } + + // resolve '..' + public static final String resolveBackpath(final String path) { + String p = path; + if (p.length() == 0 || p.charAt(0) != '/') { p = "/" + p; } + final Matcher matcher = backPathPattern.matcher(p); + while (matcher.find()) { + p = matcher.replaceAll(""); + matcher.reset(p); + } + return p.equals("") ? "/" : p; + } + + /** + * Escapes the following parts of the url, this object already contains: + *
<ul>
+ * <li>path: see {@link #escape(String)}</li>
+ * <li>ref: same as above</li>
+ * <li>quest: same as above without the ampersand ("&amp;") and the equals symbol</li>
+ * </ul>
+ */ + private void escape() { + if (path != null && path.indexOf('%') == -1) escapePath(); + if (quest != null && quest.indexOf('%') == -1) escapeQuest(); + if (ref != null && ref.indexOf('%') == -1) escapeRef(); + } + + private void escapePath() { + final String[] pathp = patternSlash.split(path, -1); + StringBuilder ptmp = new StringBuilder(path.length() + 10); + for (int i = 0; i < pathp.length; i++) { + ptmp.append('/'); + ptmp.append(escape(pathp[i])); + } + path = ptmp.substring((ptmp.length() > 0) ? 1 : 0); + } + + private void escapeRef() { + ref = escape(ref).toString(); + } + + private void escapeQuest() { + final String[] questp = patternAmp.split(quest, -1); + StringBuilder qtmp = new StringBuilder(quest.length() + 10); + for (int i = 0; i < questp.length; i++) { + if (questp[i].indexOf('=') != -1) { + qtmp.append('&'); + qtmp.append(escape(questp[i].substring(0, questp[i].indexOf('=')))); + qtmp.append('='); + qtmp.append(escape(questp[i].substring(questp[i].indexOf('=') + 1))); + } else { + qtmp.append('&'); + qtmp.append(escape(questp[i])); + } + } + quest = qtmp.substring((qtmp.length() > 0) ? 
1 : 0); + } + + private final static String[] hex = { + "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", + "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", + "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", + "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", + "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", + "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F", + "%30", "%31", "%32", "%33", "%34", "%35", "%36", "%37", + "%38", "%39", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", + "%40", "%41", "%42", "%43", "%44", "%45", "%46", "%47", + "%48", "%49", "%4A", "%4B", "%4C", "%4D", "%4E", "%4F", + "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57", + "%58", "%59", "%5A", "%5B", "%5C", "%5D", "%5E", "%5F", + "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67", + "%68", "%69", "%6A", "%6B", "%6C", "%6D", "%6E", "%6F", + "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77", + "%78", "%79", "%7A", "%7B", "%7C", "%7D", "%7E", "%7F", + "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", + "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", + "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", + "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", + "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", + "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", + "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", + "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", + "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", + "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", + "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", + "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", + "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", + "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", + "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", + "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF" + }; + + /** + * Encode a string to the "x-www-form-urlencoded" form, enhanced + * with 
the UTF-8-in-URL proposal. This is what happens: + * + *
<ul>
+ * <li>The ASCII characters 'a' through 'z', 'A' through 'Z',
+ * and '0' through '9' remain the same.</li>
+ *
+ * <li>The unreserved characters - _ . ! ~ * ' ( ) remain the same.</li>
+ *
+ * <li>All other ASCII characters are converted into the
+ * 3-character string "%xy", where xy is
+ * the two-digit hexadecimal representation of the character
+ * code</li>
+ *
+ * <li>All non-ASCII characters are encoded in two steps: first
+ * to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
+ * secondly each of these bytes is encoded as "%xx".</li>
+ * </ul>
+ * + * @param s The string to be encoded + * @return The encoded string + */ + // from: http://www.w3.org/International/URLUTF8Encoder.java + public static StringBuilder escape(final String s) { + final int len = s.length(); + final StringBuilder sbuf = new StringBuilder(len + 10); + for (int i = 0; i < len; i++) { + final int ch = s.charAt(i); + if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' + sbuf.append((char)ch); + } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' + sbuf.append((char)ch); + } else if ('0' <= ch && ch <= '9') { // '0'..'9' + sbuf.append((char)ch); + } else if (ch == ' ') { // space + sbuf.append("%20"); + } else if (ch == '&' || ch == ':' // unreserved + || ch == '-' || ch == '_' + || ch == '.' || ch == '!' + || ch == '~' || ch == '*' + || ch == '\'' || ch == '(' + || ch == ')' || ch == ';') { + sbuf.append((char)ch); + } else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced + sbuf.append((char)ch); + } else if (ch <= 0x007f) { // other ASCII + sbuf.append(hex[ch]); + } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF + sbuf.append(hex[0xc0 | (ch >> 6)]); + sbuf.append(hex[0x80 | (ch & 0x3F)]); + } else { // 0x7FF < ch <= 0xFFFF + sbuf.append(hex[0xe0 | (ch >> 12)]); + sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]); + sbuf.append(hex[0x80 | (ch & 0x3F)]); + } + } + return sbuf; + } + + // from: http://www.w3.org/International/unescape.java + public static String unescape(final String s) { + final int l = s.length(); + final StringBuilder sbuf = new StringBuilder(l); + int ch = -1; + int b, sumb = 0; + for (int i = 0, more = -1; i < l; i++) { + /* Get next byte b from URL segment s */ + switch (ch = s.charAt(i)) { + case '%': + if (i + 2 < l) { + ch = s.charAt(++i); + int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; + ch = s.charAt(++i); + int lb = (Character.isDigit ((char) ch) ? 
ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; + b = (hb << 4) | lb; + } else { + b = ch; + } + break; + case '+': + b = ' '; + break; + default: + b = ch; + } + /* Decode byte b as UTF-8, sumb collects incomplete chars */ + if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte) + sumb = (sumb << 6) | (b & 0x3f); // Add 6 bits to sumb + if (--more == 0) sbuf.append((char) sumb); // Add char to sbuf + } else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits) + sbuf.append((char) b); // Store in sbuf + } else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits) + sumb = b & 0x1f; + more = 1; // Expect 1 more byte + } else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits) + sumb = b & 0x0f; + more = 2; // Expect 2 more bytes + } else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits) + sumb = b & 0x07; + more = 3; // Expect 3 more bytes + } else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits) + sumb = b & 0x03; + more = 4; // Expect 4 more bytes + } else /*if ((b & 0xfe) == 0xfc)*/ { // 1111110x (yields 1 bit) + sumb = b & 0x01; + more = 5; // Expect 5 more bytes + } + /* We don't test if the UTF-8 encoding is well-formed */ + } + return sbuf.toString(); + } + + private void identPort(final String inputURL, final int dflt) throws MalformedURLException { + // identify ref in file + final int r = this.host.indexOf(':'); + if (r < 0) { + this.port = dflt; + } else { + try { + final String portStr = this.host.substring(r + 1); + if (portStr.trim().length() > 0) this.port = Integer.parseInt(portStr); + else this.port = -1; + this.host = this.host.substring(0, r); + } catch (final NumberFormatException e) { + throw new MalformedURLException("wrong port in host fragment '" + this.host + "' of input url '" + inputURL + "'"); + } + } + } + + private void identRef() { + // identify ref in file + final int r = path.indexOf('#'); + if (r < 0) { + this.ref = null; + } else { + this.ref = path.substring(r + 1); + this.path = path.substring(0, 
r); + } + } + + private void identQuest() { + // identify quest in file + final int r = path.indexOf('?'); + if (r < 0) { + this.quest = null; + } else { + this.quest = path.substring(r + 1); + this.path = path.substring(0, r); + } + } + + public String getFile() { + return getFile(false, false); + } + + public String getFile(final boolean excludeReference, final boolean removeSessionID) { + // this is the path plus quest plus ref + // if there is no quest and no ref the result is identical to getPath + // this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile() + if (quest == null) return (excludeReference || ref == null) ? path : path + "#" + ref; + String q = quest; + if (removeSessionID) { + for (String sid: sessionIDnames) { + if (q.toLowerCase().startsWith(sid.toLowerCase() + "=")) { + int p = q.indexOf('&'); + if (p < 0) return (excludeReference || ref == null) ? path : path + "#" + ref; + q = q.substring(p + 1); + continue; + } + int p = q.toLowerCase().indexOf("&" + sid.toLowerCase() + "="); + if (p < 0) continue; + int p1 = q.indexOf('&', p); + if (p1 < 0) { + q = q.substring(0, p); + } else { + q = q.substring(0, p) + q.substring(p1); + } + } + } + return (excludeReference || ref == null) ? path + "?" + q : path + "?" 
+ q + "#" + ref; + } + + public String getFileName() { + // this is a method not defined in any sun api + // it returns the last portion of a path without any reference + final int p = path.lastIndexOf('/'); + if (p < 0) return path; + if (p == path.length() - 1) return ""; // no file name, this is a path to a directory + return path.substring(p + 1); // the 'real' file name + } + + public String getFileExtension() { + String name = getFileName(); + int p = name.lastIndexOf('.'); + if (p < 0) return ""; + return name.substring(p + 1); + } + + public String getPath() { + return path; + } + + /** + * return the file object to a local file + * this patches also 'strange' windows file paths + * @return the file as absolute path + */ + public File getLocalFile() { + char c = path.charAt(1); + if (c == ':') return new File(path.replace('/', '\\')); + if (c == '|') return new File(path.charAt(0) + ":" + path.substring(2).replace('/', '\\')); + c = path.charAt(2); + if (c == ':' || c == '|') return new File(path.charAt(1) + ":" + path.substring(3).replace('/', '\\')); + return new File(path); + } + + public String getAuthority() { + return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? 
host : ""); + } + + public String getHost() { + return host; + } + + public int getPort() { + return port; + } + + public String getProtocol() { + return protocol; + } + + public String getRef() { + return ref; + } + + public void removeRef() { + ref = null; + } + + public String getUserInfo() { + return userInfo; + } + + public String getQuery() { + return quest; + } + + @Override + public String toString() { + return toNormalform(false, true); + } + + public String toNormalform(final boolean excludeReference, final boolean stripAmp) { + return toNormalform(excludeReference, stripAmp, false); + } + + public String toNormalform(final boolean excludeReference, final boolean stripAmp, final boolean removeSessionID) { + String result = toNormalform0(excludeReference, removeSessionID); + if (stripAmp) { + result = result.replaceAll("&", "&"); + } + return result; + } + + private String toNormalform0(final boolean excludeReference, final boolean removeSessionID) { + // generates a normal form of the URL + boolean defaultPort = false; + if (this.protocol.equals("mailto")) { + return this.protocol + ":" + this.userInfo + "@" + this.host; + } else if (isHTTP()) { + if (this.port < 0 || this.port == 80) { defaultPort = true; } + } else if (isHTTPS()) { + if (this.port < 0 || this.port == 443) { defaultPort = true; } + } else if (isFTP()) { + if (this.port < 0 || this.port == 21) { defaultPort = true; } + } else if (isSMB()) { + if (this.port < 0 || this.port == 445) { defaultPort = true; } + } else if (isFile()) { + defaultPort = true; + } + final String urlPath = this.getFile(excludeReference, removeSessionID); + + if (defaultPort) { + return + this.protocol + "://" + + ((this.getHost() == null) ? "" : ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) + + urlPath; + } + return this.protocol + "://" + + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + + this.getHost().toLowerCase() + ((defaultPort) ? 
("") : (":" + this.port)) + urlPath; + } + + public int hashCode() { + return this.toNormalform(true, true).hashCode(); + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(final Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (!(obj instanceof MultiProtocolURI)) return false; + MultiProtocolURI other = (MultiProtocolURI) obj; + return this.toString().equals(other.toString()); + } + + public int compareTo(final Object h) { + assert (h instanceof MultiProtocolURI); + return this.toString().compareTo(((MultiProtocolURI) h).toString()); + } + + public boolean isPOST() { + return (this.quest != null) && (this.quest.length() > 0); + } + + public final boolean isCGI() { + final String ls = unescape(path.toLowerCase()); + return ls.indexOf(".cgi") >= 0 || + ls.indexOf(".exe") >= 0; + } + + public final boolean isIndividual() { + final String q = unescape(path.toLowerCase()); + for (String sid: sessionIDnames) { + if (q.startsWith(sid.toLowerCase() + "=")) return true; + int p = q.indexOf("&" + sid.toLowerCase() + "="); + if (p >= 0) return true; + } + int pos; + return + ((pos = q.indexOf("sid")) > 0 && + (q.charAt(--pos) == '?' 
|| q.charAt(pos) == '&' || q.charAt(pos) == ';') && + (pos += 5) < q.length() && + (q.charAt(pos) != '&' && q.charAt(--pos) == '=') + ) || + + ((pos = q.indexOf("sessionid")) > 0 && + (pos += 10) < q.length() && + (q.charAt(pos) != '&' && + (q.charAt(--pos) == '=' || q.charAt(pos) == '/')) + ) || + + ((pos = q.indexOf("phpsessid")) > 0 && + (pos += 10) < q.length() && + (q.charAt(pos) != '&' && + (q.charAt(--pos) == '=' || q.charAt(pos) == '/'))); + } + + // checks for local/global IP range and local IP + public boolean isLocal() { + return (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")); + } + + // language calculation + public final String language() { + String language = "en"; + if (host == null) return language; + final int pos = host.lastIndexOf('.'); + if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase(); + if (language.equals("uk")) language = "en"; + return language; + } + + // The MultiProtocolURI may be used to integrate File- and SMB accessed into one object + // some extraction methods that generate File/SmbFile objects from the MultiProtocolURI + + /** + * create a standard java URL. + * Please call isHTTP(), isHTTPS() and isFTP() before using this class + */ + public java.net.URL getURL() throws MalformedURLException { + if (!(isHTTP() || isHTTPS() || isFTP())) throw new UnsupportedOperationException(); + return new java.net.URL(this.toNormalform(false, true)); + } + + /** + * create a standard java File. 
+ * Please call isFile() before using this class + */ + public java.io.File getFSFile() { + if (!isFile()) throw new UnsupportedOperationException(); + return new java.io.File(this.toNormalform(false, true).substring(7)); + } + + /** + * create a smb File + * Please call isSMB() before using this class + * @throws MalformedURLException + */ + public SmbFile getSmbFile() throws MalformedURLException { + if (!isSMB()) throw new UnsupportedOperationException(); + String url = this.toNormalform(false, true); + return new SmbFile(url); + } + + // some methods that let the MultiProtocolURI look like a java.io.File object + // to use these methods the object must be either of type isFile() or isSMB() + + public boolean exists() throws IOException { + if (isFile()) return getFSFile().exists(); + if (isSMB()) try { + return getSmbFile().exists(); + } catch (SmbException e) { + throw new IOException("SMB.exists SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.exists MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return false; + } + + public boolean canRead() throws IOException { + if (isFile()) return getFSFile().canRead(); + if (isSMB()) try { + return getSmbFile().canRead(); + } catch (SmbException e) { + throw new IOException("SMB.canRead SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.canRead MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return false; + } + + public boolean canWrite() throws IOException { + if (isFile()) return getFSFile().canWrite(); + if (isSMB()) try { + return getSmbFile().canWrite(); + } catch (SmbException e) { + throw new IOException("SMB.canWrite SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.canWrite MalformedURLException for " + 
this.toString() + ": " + e.getMessage()); + } + return false; + } + + public boolean isHidden() throws IOException { + if (isFile()) return getFSFile().isHidden(); + if (isSMB()) try { + return getSmbFile().isHidden(); + } catch (SmbException e) { + throw new IOException("SMB.isHidden SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.isHidden MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return false; + } + + public boolean isDirectory() throws IOException { + if (isFile()) return getFSFile().isDirectory(); + if (isSMB()) try { + return getSmbFile().isDirectory(); + } catch (SmbException e) { + throw new IOException("SMB.isDirectory SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.isDirectory MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return false; + } + + public long length() throws IOException { + if (isFile()) return getFSFile().length(); + if (isSMB()) try { + return getSmbFile().length(); + } catch (SmbException e) { + throw new IOException("SMB.length SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.length MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return 0; + } + + public long lastModified() throws IOException { + if (isFile()) return getFSFile().lastModified(); + if (isSMB()) try { + return getSmbFile().lastModified(); + } catch (SmbException e) { + throw new IOException("SMB.lastModified SmbException for " + this.toString() + ": " + e.getMessage()); + } catch (MalformedURLException e) { + throw new IOException("SMB.lastModified MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return 0; + } + + public String getName() throws IOException { + if (isFile()) return getFSFile().getName(); + if 
(isSMB()) try { + return getSmbFile().getName(); + } catch (MalformedURLException e) { + throw new IOException("SMB.getName MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return null; + } + + public String[] list() throws IOException { + if (isFile()) return getFSFile().list(); + if (isSMB()) try { + SmbFile sf = getSmbFile(); + try { + return sf.list(); + } catch (SmbException e) { + throw new IOException("SMB.list SmbException for " + sf.toString() + ": " + e.getMessage()); + } + } catch (MalformedURLException e) { + throw new IOException("SMB.list MalformedURLException for " + this.toString() + ": " + e.getMessage()); + } + return null; + } + + public InputStream getInputStream() throws IOException { + if (isFile()) return new FileInputStream(getFSFile()); + if (isSMB()) return new SmbFileInputStream(getSmbFile()); + return null; + } + + //--------------------- + + private static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; + public static final Pattern splitpattern = Pattern.compile(splitrex); + public static String[] urlComps(String normalizedURL) { + final int p = normalizedURL.indexOf("//"); + if (p > 0) normalizedURL = normalizedURL.substring(p + 2); + return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url + } + + public static void main(final String[] args) { + final String[][] test = new String[][]{ + new String[]{null, "C:WINDOWS\\CMD0.EXE"}, + new String[]{null, "file://C:WINDOWS\\CMD0.EXE"}, + new String[]{null, "file:/bin/yacy1"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:///bin/yacy2"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:C:WINDOWS\\CMD.EXE"}, + new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"}, + new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"}, + new String[]{null, "http://www.anomic.de/test/"}, + new String[]{null, 
"http://www.anomic.de/"}, + new String[]{null, "http://www.anomic.de"}, + new String[]{null, "http://www.anomic.de/home/test?x=1#home"}, + new String[]{null, "http://www.anomic.de/home/test?x=1"}, + new String[]{null, "http://www.anomic.de/home/test#home"}, + new String[]{null, "ftp://ftp.anomic.de/home/test#home"}, + new String[]{null, "http://www.anomic.de/home/../abc/"}, + new String[]{null, "mailto:abcdefg@nomailnomail.com"}, + new String[]{"http://www.anomic.de/home", "test"}, + new String[]{"http://www.anomic.de/home", "test/"}, + new String[]{"http://www.anomic.de/home/", "test"}, + new String[]{"http://www.anomic.de/home/", "test/"}, + new String[]{"http://www.anomic.de/home/index.html", "test.htm"}, + new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"}, + new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"}, + new String[]{"http://www.anomic.de/home/index.html", "../test"}, + new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"}, + new String[]{null, "news:de.test"}, + new String[]{"http://www.anomic.de/home", "news:de.test"}, + new String[]{null, "mailto:bob@web.com"}, + new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"}, + new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"}, + new String[]{null, "ftp://ftp.delegate.org/"}, + new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"}, + new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"}, + new String[]{"http://www.anomic.de","javascipt:temp"}, + new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"}, + new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, + new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, + new String[]{null, 
"http://www.scc.kit.edu/publikationen/80.php?PHPSESSID=5f3624d3e1c33d4c086ab600d4d5f5a1"}, + new String[]{null, "smb://localhost/"}, + new String[]{null, "smb://localhost/repository"}, // paths must end with '/' + new String[]{null, "smb://localhost/repository/"}, + new String[]{null, "\\\\localhost\\"}, // Windows-like notion of smb shares + new String[]{null, "\\\\localhost\\repository"}, + new String[]{null, "\\\\localhost\\repository\\"} + }; + //MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File("defaults/sessionid.names"))); + String environment, url; + MultiProtocolURI aURL, aURL1; + java.net.URL jURL; + for (int i = 0; i < test.length; i++) { + environment = test[i][0]; + url = test[i][1]; + try {aURL = MultiProtocolURI.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;} + if (environment == null) { + try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;} + } else { + try {jURL = new java.net.URL(new java.net.URL(environment), url);} catch (final MalformedURLException e) {jURL = null;} + } + + // check equality to java.net.URL + if (((aURL == null) && (jURL != null)) || + ((aURL != null) && (jURL == null)) || + ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) { + System.out.println("Difference for environment=" + environment + ", url=" + url + ":"); + System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString()); + System.out.println((aURL == null) ? 
"aURL rejected input" : "aURL=" + aURL.toString()); + } + + // check stability: the normalform of the normalform must be equal to the normalform + if (aURL != null) try { + aURL1 = new MultiProtocolURI(aURL.toNormalform(false, true)); + if (!(aURL1.toNormalform(false, true).equals(aURL.toNormalform(false, true)))) { + System.out.println("no stability for url:"); + System.out.println("aURL0=" + aURL.toString()); + System.out.println("aURL1=" + aURL1.toString()); + } + } catch (final MalformedURLException e) { + System.out.println("no stability for url:"); + System.out.println("aURL0=" + aURL.toString()); + System.out.println("aURL1 cannot be computed:" + e.getMessage()); + } + } + } +} diff --git a/source/net/yacy/kelondro/util/Punycode.java b/source/net/yacy/cora/document/Punycode.java similarity index 94% rename from source/net/yacy/kelondro/util/Punycode.java rename to source/net/yacy/cora/document/Punycode.java index a0070b13d..5fdf3000c 100644 --- a/source/net/yacy/kelondro/util/Punycode.java +++ b/source/net/yacy/cora/document/Punycode.java @@ -21,19 +21,19 @@ * USA */ -package net.yacy.kelondro.util; +package net.yacy.cora.document; public class Punycode { /* Punycode parameters */ - final static int TMIN = 1; - final static int TMAX = 26; - final static int BASE = 36; - final static int INITIAL_N = 128; - final static int INITIAL_BIAS = 72; - final static int DAMP = 700; - final static int SKEW = 38; - final static char DELIMITER = '-'; + private final static int TMIN = 1; + private final static int TMAX = 26; + private final static int BASE = 36; + private final static int INITIAL_N = 128; + private final static int INITIAL_BIAS = 72; + private final static int DAMP = 700; + private final static int SKEW = 38; + private final static char DELIMITER = '-'; /** * Punycodes a unicode string. 
diff --git a/source/net/yacy/document/parser/xml/RSSFeed.java b/source/net/yacy/cora/document/RSSFeed.java similarity index 77% rename from source/net/yacy/document/parser/xml/RSSFeed.java rename to source/net/yacy/cora/document/RSSFeed.java index ec0f9610f..f223e4afc 100644 --- a/source/net/yacy/document/parser/xml/RSSFeed.java +++ b/source/net/yacy/cora/document/RSSFeed.java @@ -1,40 +1,31 @@ -// RSSFeed.java -// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 24.04.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * RSSFeed + * Copyright 2007 by Michael Peter Christen + * First released 16.7.2007 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . + */ -package net.yacy.document.parser.xml; +package net.yacy.cora.document; import java.util.HashSet; import java.util.Iterator; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; -import net.yacy.document.content.RSSMessage; - - -public class RSSFeed implements Iterable { +public class RSSFeed implements Iterable { // static channel names of feeds public static final String TEST = "TEST"; @@ -119,7 +110,7 @@ public class RSSFeed implements Iterable { return messages.size(); } - public Iterator iterator() { + public Iterator iterator() { return new messageIterator(); } @@ -131,7 +122,7 @@ public class RSSFeed implements Iterable { return messages.remove(nextGUID); } - public class messageIterator implements Iterator{ + public class messageIterator implements Iterator{ Iterator GUIDiterator; String lastGUID; diff --git a/source/net/yacy/document/content/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java similarity index 62% rename from source/net/yacy/document/content/RSSMessage.java rename to source/net/yacy/cora/document/RSSMessage.java index bf4e08859..5981b8e56 100644 --- a/source/net/yacy/document/content/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -1,31 +1,24 @@ -// RSSMessage.java -// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany -// first published 16.07.2007 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * RSSMessage + * Copyright 2007 by Michael Peter Christen + * First released 16.7.2007 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . 
+ */ - -package net.yacy.document.content; +package net.yacy.cora.document; import java.util.Date; import java.util.HashMap; @@ -33,7 +26,7 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; -public class RSSMessage { +public class RSSMessage implements Hit { // statics for item generation and automatic categorization private static int guidcount = 0; @@ -165,4 +158,74 @@ public class RSSMessage { public String toString() { return this.map.toString(); } + + public void setAuthor(String title) { + // TODO Auto-generated method stub + + } + + public void setCategory(String title) { + // TODO Auto-generated method stub + + } + + public void setCopyright(String title) { + // TODO Auto-generated method stub + + } + + public void setCreator(String pubdate) { + // TODO Auto-generated method stub + + } + + public void setDescription(String description) { + // TODO Auto-generated method stub + + } + + public void setDocs(String guid) { + // TODO Auto-generated method stub + + } + + public void setGuid(String guid) { + // TODO Auto-generated method stub + + } + + public void setLanguage(String title) { + // TODO Auto-generated method stub + + } + + public void setLink(String link) { + // TODO Auto-generated method stub + + } + + public void setPubDate(String pubdate) { + // TODO Auto-generated method stub + + } + + public void setReferrer(String title) { + // TODO Auto-generated method stub + + } + + public void setSize(long size) { + // TODO Auto-generated method stub + + } + + public void setSizename(String sizename) { + // TODO Auto-generated method stub + + } + + public void setTitle(String title) { + // TODO Auto-generated method stub + + } } diff --git a/source/net/yacy/document/parser/xml/RSSReader.java b/source/net/yacy/cora/document/RSSReader.java similarity index 70% rename from source/net/yacy/document/parser/xml/RSSReader.java rename to source/net/yacy/cora/document/RSSReader.java index 0f12e65fe..c4f7a3072 100644 --- 
a/source/net/yacy/document/parser/xml/RSSReader.java +++ b/source/net/yacy/cora/document/RSSReader.java @@ -1,30 +1,24 @@ -// RSSReader.java -// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 16.07.2007 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * RSSReader + * Copyright 2007 by Michael Peter Christen + * First released 16.7.2007 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . 
+ */ -package net.yacy.document.parser.xml; +package net.yacy.cora.document; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -34,10 +28,6 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.document.content.RSSMessage; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -86,25 +76,21 @@ public class RSSReader extends DefaultHandler { } } - public static RSSReader parse(final byte[] a) { + public static RSSReader parse(final byte[] a) throws IOException { // check integrity of array if ((a == null) || (a.length == 0)) { - Log.logWarning("rssReader", "response=null"); - return null; + throw new IOException("response=null"); } if (a.length < 100) { - Log.logWarning("rssReader", "response=" + new String(a)); - return null; + throw new IOException("response=" + new String(a)); } - if (!ByteBuffer.equals(a, ". 
+ */ + + +package net.yacy.cora.protocol; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.httpclient.methods.multipart.Part; + +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.http.client.Client; +import de.anomic.http.client.RemoteProxyConfig; +import de.anomic.http.server.HeaderFramework; +import de.anomic.http.server.RequestHeader; +import de.anomic.http.server.ResponseContainer; + +public class HttpConnector { + + /** + * send data to the server named by vhost + * + * @param address address of the server + * @param vhost name of the server at address which should respond + * @param post data to send (name-value-pairs) + * @param timeout in milliseconds + * @return response body + * @throws IOException + */ + public static byte[] wput(final String url, final String vhost, final List post, final int timeout) throws IOException { + return wput(url, vhost, post, timeout, false); + } + + /** + * send data to the server named by vhost + * + * @param address address of the server + * @param vhost name of the server at address which should respond + * @param post data to send (name-value-pairs) + * @param timeout in milliseconds + * @param gzipBody send with content gzip encoded + * @return response body + * @throws IOException + */ + public static byte[] wput(final String url, final String vhost, final List post, final int timeout, final boolean gzipBody) throws IOException { + final RequestHeader header = new RequestHeader(); + header.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); + header.put(HeaderFramework.HOST, vhost); + final Client client = new Client(timeout, header); + client.setProxy(proxyConfig()); + + ResponseContainer res = null; + byte[] content = null; + try { + // send request/data + res = client.POST(url, post, gzipBody); + content = res.getData(); + } finally { + if(res != null) { + // release connection + res.closeStream(); + } + } + return content; + } + + + private static final 
RemoteProxyConfig proxyConfig() { + final RemoteProxyConfig p = RemoteProxyConfig.getRemoteProxyConfig(); + return ((p != null) && (p.useProxy()) && (p.useProxy4Yacy())) ? p : null; + } +} diff --git a/source/net/yacy/cora/services/Search.java b/source/net/yacy/cora/services/Search.java new file mode 100644 index 000000000..b0fa26e2d --- /dev/null +++ b/source/net/yacy/cora/services/Search.java @@ -0,0 +1,145 @@ +/** + * Search + * Copyright 2010 by Michael Peter Christen + * First released 25.05.2010 at http://yacy.net + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file COPYING.LESSER. + * If not, see . 
+ */ + + +package net.yacy.cora.services; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.RSSReader; +import net.yacy.cora.protocol.HttpConnector; + +import org.apache.commons.httpclient.methods.multipart.Part; +import org.apache.commons.httpclient.methods.multipart.StringPart; + +public class Search { + + public static BlockingQueue search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int maximumRecords) { + BlockingQueue queue = new LinkedBlockingQueue(); + searchJob job = new searchJob(rssSearchServiceURL, query, verify, global, timeout, maximumRecords, queue); + job.start(); + return queue; + } + + private final static int recordsPerSession = 10; + + public static class searchJob extends Thread { + + String urlBase, query; + boolean verify, global; + long timeout; + int startRecord, maximumRecords; + BlockingQueue queue; + + public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue queue) { + this.urlBase = urlBase; + this.query = query; + this.verify = verify; + this.global = global; + this.timeout = timeout; + this.startRecord = 0; + this.maximumRecords = maximumRecords; + this.queue = queue; + } + + public void run() { + RSSMessage message; + mainloop: while (timeout > 0 && maximumRecords > 0) { + long st = System.currentTimeMillis(); + RSSFeed feed; + try { + feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession); + } catch (IOException e1) { + break mainloop; + } + if (feed == null || feed.isEmpty()) break mainloop; + maximumRecords -= feed.size(); + 
innerloop: while (!feed.isEmpty()) { + message = feed.pollMessage(); + if (message == null) break innerloop; + try { + queue.put(message); + } catch (InterruptedException e) { + break innerloop; + } + } + startRecord += recordsPerSession; + timeout -= System.currentTimeMillis() - st; + } + try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {} + } + } + + /** + * send a query to a yacy public search interface + * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used + * @param query the query as string + * @param startRecord number of first record + * @param maximumRecords maximum number of records + * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned + * @param global if true also search results from other peers are included + * @param timeout milliseconds that are waited at maximum for a search result + * @return + */ + public static RSSFeed search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException { + MultiProtocolURI uri = null; + try { + uri = new MultiProtocolURI(rssSearchServiceURL); + } catch (MalformedURLException e) { + throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage()); + } + + // prepare request + final List post = new ArrayList(); + post.add(new StringPart("query", query, Charset.defaultCharset().name())); + post.add(new StringPart("startRecord", Integer.toString(startRecord), Charset.defaultCharset().name())); + post.add(new StringPart("maximumRecords", Long.toString(maximumRecords), Charset.defaultCharset().name())); + post.add(new StringPart("verify", verify ? "true" : "false", Charset.defaultCharset().name())); + post.add(new StringPart("resource", global ? 
"global" : "local", Charset.defaultCharset().name())); + + // send request + try { + final byte[] result = HttpConnector.wput(rssSearchServiceURL, uri.getHost(), post, (int) timeout); + //String debug = new String(result); System.out.println("*** DEBUG: " + debug); + final RSSReader reader = RSSReader.parse(result); + if (reader == null) { + throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null"); + } + final RSSFeed feed = reader.getFeed(); + if (feed == null) { + // case where the rss reader does not understand the content + throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)"); + } + return feed; + } catch (final IOException e) { + throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString()); + } + } + +} diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index 957a39990..2e75b9e38 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -33,7 +33,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.workflow.WorkflowThread; @@ -108,7 +108,7 @@ public abstract class AbstractParser implements Idiom { return tempFile; } - public int parseDir(final DigestURI location, final String prefix, final File dir, final Document doc) + public int parseDir(final MultiProtocolURI location, final String prefix, final File dir, final Document doc) throws ParserException, InterruptedException, IOException { if (!dir.isDirectory()) throw new ParserException("tried to parse ordinary file " + dir + " as directory", location); @@ -122,7 +122,7 @@ public abstract class AbstractParser 
implements Idiom { if (file.isDirectory()) { result += parseDir(location, prefix, file, doc); } else try { - final DigestURI url = DigestURI.newURL(location, "/" + prefix + "/" + final MultiProtocolURI url = MultiProtocolURI.newURL(location, "/" + prefix + "/" // XXX: workaround for relative paths within document + file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1) + "/" + file.getName()); @@ -151,7 +151,7 @@ public abstract class AbstractParser implements Idiom { * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[]) */ public Document parse( - final DigestURI location, + final MultiProtocolURI location, final String mimeType, final String charset, final byte[] source @@ -186,7 +186,7 @@ public abstract class AbstractParser implements Idiom { * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File) */ public Document parse( - final DigestURI location, + final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile @@ -220,7 +220,7 @@ public abstract class AbstractParser implements Idiom { * * @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) */ - public abstract Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; + public abstract Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; /** * Return the name of the parser diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index fc1228ab5..3c7a88c0a 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -46,10 +46,10 @@ import java.util.Properties; import java.util.TreeMap; import java.util.TreeSet; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.language.Identificator; import 
net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.logging.Log; @@ -125,7 +125,7 @@ public final class Condenser { this.languageIdentificator = new Identificator(); - Map.Entry entry; + Map.Entry entry; if (indexText) { createCondensement(document.getText()); // the phrase counter: @@ -179,7 +179,7 @@ public final class Condenser { if (indexMedia) { // add anchor descriptions: here, we also add the url components // audio - Iterator> i = document.getAudiolinks().entrySet().iterator(); + Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 69ce288dc..7f7f32d03 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -45,9 +45,9 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; @@ -55,7 +55,7 @@ import net.yacy.kelondro.util.FileUtils; public class Document { - private final DigestURI source; // the source url + private final MultiProtocolURI source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field @@ -65,24 +65,24 @@ public class Document { private final List 
sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible - private final Map anchors; // all links embedded as clickeable entities (anchor tags) - private final HashMap images; // all visible pictures in document + private final Map anchors; // all links embedded as clickeable entities (anchor tags) + private final HashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - private Map hyperlinks, audiolinks, videolinks, applinks; + private Map hyperlinks, audiolinks, videolinks, applinks; private Map emaillinks; - private DigestURI favicon; + private MultiProtocolURI favicon; private boolean resorted; private InputStream textStream; private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private Set languages; private boolean indexingDenied; - public Document(final DigestURI location, final String mimeType, final String charset, final Set languages, + public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String publisher, final String[] sections, final String abstrct, - final Object text, final Map anchors, final HashMap images, + final Object text, final Map anchors, final HashMap images, boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; @@ -92,8 +92,8 @@ public class Document { this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.sections = (sections == null) ? 
new LinkedList() : Arrays.asList(sections); this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); - this.anchors = (anchors == null) ? new HashMap(0) : anchors; - this.images = (images == null) ? new HashMap() : images; + this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.images = (images == null) ? new HashMap() : images; this.publisher = publisher; this.hyperlinks = null; this.audiolinks = null; @@ -159,7 +159,7 @@ dc_rights */ public String dc_title() { - return title.toString(); + return (title == null) ? "" : title.toString(); } public void setTitle(String title) { @@ -167,9 +167,7 @@ dc_rights } public String dc_creator() { - if (creator == null) - return ""; - return creator.toString(); + return (creator == null) ? "" : creator.toString(); } public String dc_subject(final char separator) { @@ -196,7 +194,7 @@ dc_rights } public String dc_publisher() { - return this.publisher; + return this.publisher == null ? "" : this.publisher; } public String dc_format() { @@ -207,7 +205,7 @@ dc_rights return this.source.toNormalform(true, false); } - public DigestURI dc_source() { + public MultiProtocolURI dc_source() { return this.source; } @@ -282,7 +280,7 @@ dc_rights return this.keywords; } - public Map getAnchors() { + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return anchors; @@ -291,30 +289,30 @@ dc_rights // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!resorted) resortLinks(); return hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!resorted) resortLinks(); return this.videolinks; } - public HashMap 
getImages() { + public HashMap getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); return images; } - public Map getApplinks() { + public Map getApplinks() { if (!resorted) resortLinks(); return this.applinks; } @@ -329,18 +327,18 @@ dc_rights if (this.resorted) return; // extract hyperlinks, medialinks and emaillinks from anchorlinks - DigestURI url; + MultiProtocolURI url; String u; int extpos, qpos; String ext = null; - final Iterator> i = anchors.entrySet().iterator(); - hyperlinks = new HashMap(); - videolinks = new HashMap(); - audiolinks = new HashMap(); - applinks = new HashMap(); + final Iterator> i = anchors.entrySet().iterator(); + hyperlinks = new HashMap(); + videolinks = new HashMap(); + audiolinks = new HashMap(); + applinks = new HashMap(); emaillinks = new HashMap(); - final HashMap collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks - Map.Entry entry; + final HashMap collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + Map.Entry entry; while (i.hasNext()) { entry = i.next(); url = entry.getKey(); @@ -393,21 +391,21 @@ dc_rights this.resorted = true; } - public static Map allSubpaths(final Collection links) { + public static Map allSubpaths(final Collection links) { // links is either a Set of Strings (urls) or a Set of // htmlFilterImageEntries final HashSet h = new HashSet(); Iterator i = links.iterator(); Object o; - DigestURI url; + MultiProtocolURI url; String u; int pos; int l; while (i.hasNext()) try { o = i.next(); - if (o instanceof DigestURI) url = (DigestURI) o; - else if (o instanceof String) url = new DigestURI((String) o, null); + if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o; + else if (o instanceof String) url = new MultiProtocolURI((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) 
o).url(); else { assert false; @@ -428,11 +426,11 @@ dc_rights } catch (final MalformedURLException e) { } // now convert the strings to yacyURLs i = h.iterator(); - final HashMap v = new HashMap(); + final HashMap v = new HashMap(); while (i.hasNext()) { u = (String) i.next(); try { - url = new DigestURI(u, null); + url = new MultiProtocolURI(u); v.put(url, "sub"); } catch (final MalformedURLException e) { } @@ -440,23 +438,23 @@ dc_rights return v; } - public static Map allReflinks(final Collection links) { + public static Map allReflinks(final Collection links) { // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final HashMap v = new HashMap(); + final HashMap v = new HashMap(); final Iterator i = links.iterator(); Object o; - DigestURI url; + MultiProtocolURI url; String u; int pos; loop: while (i.hasNext()) try { o = i.next(); - if (o instanceof DigestURI) - url = (DigestURI) o; + if (o instanceof MultiProtocolURI) + url = (MultiProtocolURI) o; else if (o instanceof String) - url = new DigestURI((String) o, null); + url = new MultiProtocolURI((String) o); else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); else { @@ -469,7 +467,7 @@ dc_rights u = u.substring(pos); while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) u = u.substring(pos); - url = new DigestURI(u, null); + url = new MultiProtocolURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -479,7 +477,7 @@ dc_rights u = "http:/" + u.substring(pos); while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) u = "http:/" + u.substring(pos); - url = new DigestURI(u, null); + url = new MultiProtocolURI(u); if (!(v.containsKey(url))) v.put(url, "ref"); continue loop; @@ -512,14 +510,14 @@ dc_rights /** * @return the {@link URL} to the favicon that belongs to the document */ - public DigestURI getFavicon() { + public MultiProtocolURI getFavicon() { return this.favicon; } /** * @param 
faviconURL the {@link URL} to the favicon that belongs to the document */ - public void setFavicon(final DigestURI faviconURL) { + public void setFavicon(final MultiProtocolURI faviconURL) { this.favicon = faviconURL; } diff --git a/source/net/yacy/document/Idiom.java b/source/net/yacy/document/Idiom.java index 6f8fc886b..c24f02899 100644 --- a/source/net/yacy/document/Idiom.java +++ b/source/net/yacy/document/Idiom.java @@ -29,7 +29,7 @@ import java.io.File; import java.io.InputStream; import java.util.Set; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; /** @@ -51,7 +51,7 @@ public interface Idiom { * * @throws ParserException if the content could not be parsed properly */ - public Document parse(DigestURI location, String mimeType, String charset, byte[] source) + public Document parse(MultiProtocolURI location, String mimeType, String charset, byte[] source) throws ParserException, InterruptedException; /** @@ -65,7 +65,7 @@ public interface Idiom { * * @throws ParserException if the content could not be parsed properly */ - public Document parse(DigestURI location, String mimeType, String charset, File sourceFile) + public Document parse(MultiProtocolURI location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException; /** @@ -79,7 +79,7 @@ public interface Idiom { * * @throws ParserException if the content could not be parsed properly */ - public Document parse(DigestURI location, String mimeType, String charset, InputStream source) + public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; /** diff --git a/source/net/yacy/document/ParserException.java b/source/net/yacy/document/ParserException.java index 9a53f54e4..a8f3520aa 100644 --- a/source/net/yacy/document/ParserException.java +++ b/source/net/yacy/document/ParserException.java @@ -24,10 +24,10 @@ package net.yacy.document; 
-import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; public class ParserException extends Exception { - private DigestURI url = null; + private MultiProtocolURI url = null; private static final long serialVersionUID = 1L; @@ -35,12 +35,12 @@ public class ParserException extends Exception { super(); } - public ParserException(final String message, final DigestURI url) { + public ParserException(final String message, final MultiProtocolURI url) { super(message + "; url = " + url.toNormalform(true, false)); this.url = url; } - public DigestURI getURL() { + public MultiProtocolURI getURL() { return this.url; } } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index d353d99ce..a5813782b 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -40,6 +40,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.bzipParser; import net.yacy.document.parser.csvParser; import net.yacy.document.parser.docParser; @@ -61,7 +62,6 @@ import net.yacy.document.parser.vsdParser; import net.yacy.document.parser.xlsParser; import net.yacy.document.parser.zipParser; import net.yacy.document.parser.images.genericImageParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -138,7 +138,7 @@ public final class TextParser { } public static Document parseSource( - final DigestURI location, + final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile @@ -167,7 +167,7 @@ public final class TextParser { } public static Document parseSource( - final DigestURI location, + final MultiProtocolURI location, String mimeType, final String charset, final byte[] content @@ -176,7 +176,7 @@ public final class TextParser { } public 
static Document parseSource( - final DigestURI location, + final MultiProtocolURI location, String mimeType, final String charset, final long contentLength, @@ -211,7 +211,7 @@ public final class TextParser { } private static Document parseSource( - final DigestURI location, + final MultiProtocolURI location, String mimeType, Idiom idiom, final String charset, @@ -233,7 +233,7 @@ public final class TextParser { } private static Document parseSource( - final DigestURI location, + final MultiProtocolURI location, String mimeType, List idioms, final String charset, @@ -280,7 +280,7 @@ public final class TextParser { * @param mimeType * @return returns null if the content is supported. If the content is not supported, return a error string. */ - public static String supports(final DigestURI url, String mimeType) { + public static String supports(final MultiProtocolURI url, String mimeType) { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. 
List idioms = idiomParser(url, mimeType); @@ -304,7 +304,7 @@ public final class TextParser { * @return a list of Idiom parsers that may be appropriate for the given criteria * @throws ParserException */ - private static List idiomParser(final DigestURI url, String mimeType1) throws ParserException { + private static List idiomParser(final MultiProtocolURI url, String mimeType1) throws ParserException { List idioms = new ArrayList(2); // check extension @@ -345,7 +345,7 @@ public final class TextParser { return null; } - public static String supportsExtension(final DigestURI url) { + public static String supportsExtension(final MultiProtocolURI url) { String ext = url.getFileExtension().toLowerCase(); if (ext == null || ext.length() == 0) return null; if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)"; @@ -357,7 +357,7 @@ public final class TextParser { return null; } - public static String mimeOf(DigestURI url) { + public static String mimeOf(MultiProtocolURI url) { return mimeOf(url.getFileExtension()); } diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 40d445d6f..fc3ba0160 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -33,12 +33,12 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.TextParser; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import org.apache.tools.bzip2.CBZip2InputStream; @@ -75,7 +75,7 @@ public class bzipParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI location, final String mimeType, final String charset, final 
InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File tempFile = null; try { diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index 0d556687a..6f7fc1591 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -37,11 +37,11 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; /** * a parser for comma-separated values @@ -73,7 +73,7 @@ public class csvParser extends AbstractParser implements Idiom { } @Override - public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. 
@@ -112,7 +112,7 @@ public class csvParser extends AbstractParser implements Idiom { return sb.toString(); } - public List getTable(DigestURI location, String mimeType, String charset, InputStream source) { + public List getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) { ArrayList rows = new ArrayList(); BufferedReader reader; try { diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 7d2b14f4b..4114146c0 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -32,11 +32,11 @@ import java.io.UnsupportedEncodingException; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -65,7 +65,7 @@ public class docParser extends AbstractParser implements Idiom { super("Word Document Parser"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { final WordExtractor extractor; diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 624b7fdcb..f15930019 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -34,12 +34,12 @@ import java.util.HashSet; import java.util.Set; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.TextParser; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; @@ -74,7 +74,7 @@ public class gzipParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File tempFile = null; try { diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index e64006bda..203c7cbde 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -44,8 +44,8 @@ import java.util.Properties; import javax.swing.event.EventListenerList; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.htmlParser; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -79,8 +79,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links - private HashMap anchors; - private HashMap images; // urlhash/image relation + private HashMap anchors; + private HashMap images; // urlhash/image relation private final HashMap metas; private String title; //private String headline; @@ -89,23 +89,23 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final EventListenerList htmlFilterEventListeners; /** - * {@link DigestURI} to the favicon that belongs to the document + * {@link MultiProtocolURI} to the favicon that belongs 
to the document */ - private DigestURI favicon; + private MultiProtocolURI favicon; /** - * The document root {@link DigestURI} + * The document root {@link MultiProtocolURI} */ - private DigestURI root; + private MultiProtocolURI root; @SuppressWarnings("unchecked") - public ContentScraper(final DigestURI root) { + public ContentScraper(final MultiProtocolURI root) { // the root value here will not be used to load the resource. // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; - this.anchors = new HashMap(); - this.images = new HashMap(); + this.anchors = new HashMap(); + this.images = new HashMap(); this.metas = new HashMap(); this.title = ""; this.headlines = new ArrayList[4]; @@ -133,9 +133,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (b.length() != 0) content.append(b).append(32); } - private DigestURI absolutePath(final String relativePath) { + private MultiProtocolURI absolutePath(final String relativePath) { try { - return DigestURI.newURL(root, relativePath); + return MultiProtocolURI.newURL(root, relativePath); } catch (final Exception e) { return null; } @@ -149,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (width > 15 && height > 15) { final float ratio = (float) Math.min(width, height) / Math.max(width, height); if (ratio > 0.4) { - final DigestURI url = absolutePath(tagopts.getProperty("src", "")); + final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", "")); final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1); addImage(images, ie); } @@ -162,7 +162,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } catch (final NumberFormatException e) {} } if (tagname.equalsIgnoreCase("base")) try { - root = new DigestURI(tagopts.getProperty("href", ""), null); + root = new MultiProtocolURI(tagopts.getProperty("href", "")); } catch (final MalformedURLException e) 
{} if (tagname.equalsIgnoreCase("frame")) { anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); @@ -185,7 +185,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (href.length() > 0) anchors.put(absolutePath(href), areatitle); } if (tagname.equalsIgnoreCase("link")) { - final DigestURI newLink = absolutePath(tagopts.getProperty("href", "")); + final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", "")); if (newLink != null) { final String type = tagopts.getProperty("rel", ""); @@ -193,7 +193,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (type.equalsIgnoreCase("shortcut icon")) { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); - images.put(new String(ie.url().hash()), ie); + images.put(ie.url(), ie); this.favicon = newLink; } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { anchors.put(newLink, linktitle); @@ -220,7 +220,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", ""); - DigestURI url; + MultiProtocolURI url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { final String f = url.getFile(); final int p = f.lastIndexOf('.'); @@ -350,7 +350,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public Map getAnchors() { + public Map getAnchors() { // returns a url (String) / name (String) relation return anchors; } @@ -359,7 +359,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { * get all images * @return a map of */ - public HashMap getImages() { + public HashMap getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation 
return images; } @@ -369,9 +369,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { } /** - * @return the {@link DigestURI} to the favicon that belongs to the document + * @return the {@link MultiProtocolURI} to the favicon that belongs to the document */ - public DigestURI getFavicon() { + public MultiProtocolURI getFavicon() { return this.favicon; } @@ -442,7 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (s == null) s = metas.get("dc.description"); if (s == null) s = ""; if (s.length() == 0) { - return DigestURI.splitpattern.split(getTitle().toLowerCase()); + return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase()); } if (s.contains(",")) return s.split(" |,"); if (s.contains(";")) return s.split(" |;"); @@ -536,32 +536,32 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (page == null) throw new IOException("no content in file " + file.toString()); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8",new DigestURI("http://localhost", null),null,false); + final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false); final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); // scrape content - final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost", null)); + final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost")); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); return scraper; } - public static void addAllImages(final HashMap a, final HashMap b) { - final Iterator> i = b.entrySet().iterator(); - Map.Entry ie; + public static void addAllImages(final HashMap a, final HashMap b) { + 
final Iterator> i = b.entrySet().iterator(); + Map.Entry ie; while (i.hasNext()) { ie = i.next(); addImage(a, ie.getValue()); } } - public static void addImage(final HashMap a, final ImageEntry ie) { - if (a.containsKey(new String(ie.url().hash()))) { + public static void addImage(final HashMap a, final ImageEntry ie) { + if (a.containsKey(ie.url())) { // in case of a collision, take that image that has the better image size tags - if ((ie.height() > 0) && (ie.width() > 0)) a.put(new String(ie.url().hash()), ie); + if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); } else { - a.put(new String(ie.url().hash()), ie); + a.put(ie.url(), ie); } } diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java index 7aed5ef92..f9db000f8 100644 --- a/source/net/yacy/document/parser/html/ImageEntry.java +++ b/source/net/yacy/document/parser/html/ImageEntry.java @@ -26,16 +26,16 @@ package net.yacy.document.parser.html; import java.util.Comparator; -import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.cora.document.MultiProtocolURI; public class ImageEntry implements Comparable, Comparator { - private final DigestURI url; + private final MultiProtocolURI url; private final String alt; private final int width, height; private final long fileSize; - public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) { + public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) { this.url = url; this.alt = alt; this.width = width; @@ -43,7 +43,7 @@ public class ImageEntry implements Comparable, Comparator languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); + final HashMap anchors = new HashMap(); + final HashMap images = new HashMap(); // add this image to the map of images String infoString = ii.info.toString(); - images.put(infoString, new 
ImageEntry(location, "", ii.width, ii.height, -1)); + images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); if (title == null) title = location.toNormalform(true, true); @@ -204,7 +204,7 @@ public class genericImageParser extends AbstractParser implements Idiom { } public static ImageInfo parseJavaImage( - final DigestURI location, + final MultiProtocolURI location, final InputStream sourceStream) throws ParserException { BufferedImage image = null; try { @@ -222,7 +222,7 @@ public class genericImageParser extends AbstractParser implements Idiom { } public static ImageInfo parseJavaImage( - final DigestURI location, + final MultiProtocolURI location, final BufferedImage image) { ImageInfo ii = new ImageInfo(location); ii.image = image; @@ -259,12 +259,12 @@ public class genericImageParser extends AbstractParser implements Idiom { } public static class ImageInfo { - public DigestURI location; + public MultiProtocolURI location; public BufferedImage image; public StringBuilder info; public int height; public int width; - public ImageInfo(final DigestURI location) { + public ImageInfo(final MultiProtocolURI location) { this.location = location; this.image = null; this.info = new StringBuilder(); @@ -278,9 +278,9 @@ public class genericImageParser extends AbstractParser implements Idiom { public static void main(final String[] args) { File image = new File(args[0]); genericImageParser parser = new genericImageParser(); - DigestURI uri; + MultiProtocolURI uri; try { - uri = new DigestURI("http://localhost/" + image.getName()); + uri = new MultiProtocolURI("http://localhost/" + image.getName()); Document document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image)); System.out.println(document.toString()); } catch (MalformedURLException e) { diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 026a532eb..d3aad25e0 100644 --- 
a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -39,13 +39,13 @@ import java.util.zip.ZipFile; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; @@ -106,7 +106,7 @@ public class odtParser extends AbstractParser implements Idiom { } @Override - public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { Writer writer = null; File writerFile = null; @@ -228,7 +228,7 @@ public class odtParser extends AbstractParser implements Idiom { } } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 61e07c103..0003fb0ea 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -39,13 +39,13 @@ import java.util.zip.ZipFile; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; 
+import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -90,7 +90,7 @@ public class ooxmlParser extends AbstractParser implements Idiom { } @Override - public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { Writer writer = null; File writerFile = null; @@ -215,7 +215,7 @@ public class ooxmlParser extends AbstractParser implements Idiom { } } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File dest = null; try { // creating a tempfile diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index f3442b014..08048fdc5 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -44,11 +44,11 @@ import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.util.PDFTextStripper; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { // create a pdf parser final PDDocument theDocument; diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 9504f24e3..312441266 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -32,11 +32,11 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import org.apache.poi.hslf.extractor.PowerPointExtractor; @@ -70,7 +70,7 @@ public class pptParser extends AbstractParser implements Idiom { * parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document */ - public Document parse(final DigestURI location, final String mimeType, + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { diff --git a/source/net/yacy/document/parser/psParser.java 
b/source/net/yacy/document/parser/psParser.java index cef0dbc16..ecc774afb 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -37,11 +37,11 @@ import java.io.InputStreamReader; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; @@ -104,7 +104,7 @@ public class psParser extends AbstractParser implements Idiom { @Override - public Document parse(final DigestURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException { File outputFile = null; try { @@ -277,7 +277,7 @@ public class psParser extends AbstractParser implements Idiom { super.reset(); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File tempFile = null; try { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 81687484e..668dd3ab9 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -40,18 +40,18 @@ import java.util.LinkedList; import java.util.Map; import java.util.Set; +import net.yacy.cora.document.Hit; +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; +import 
net.yacy.cora.document.RSSReader; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.document.content.RSSMessage; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.TransformerWriter; -import net.yacy.document.parser.xml.RSSFeed; -import net.yacy.document.parser.xml.RSSReader; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; @@ -78,11 +78,11 @@ public class rssParser extends AbstractParser implements Idiom { super("Rich Site Summary/Atom Feed Parser"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { final LinkedList feedSections = new LinkedList(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); + final HashMap anchors = new HashMap(); + final HashMap images = new HashMap(); final ByteBuffer text = new ByteBuffer(); final CharBuffer authors = new CharBuffer(); @@ -119,20 +119,20 @@ public class rssParser extends AbstractParser implements Idiom { if (feed.getImage() != null) { try { - DigestURI imgURL = new DigestURI(feed.getImage(), null); - images.put(new String(imgURL.hash()), new ImageEntry(imgURL, feedTitle, -1, -1, -1)); + MultiProtocolURI imgURL = new MultiProtocolURI(feed.getImage()); + images.put(imgURL, new ImageEntry(imgURL, feedTitle, -1, -1, -1)); } catch (MalformedURLException e) {} } // loop through the feed items - for (final RSSMessage item: feed) { + 
for (final Hit item: feed) { // check for interruption checkInterruption(); final String itemTitle = item.getTitle(); - DigestURI itemURL = null; + MultiProtocolURI itemURL = null; try { - itemURL = new DigestURI(item.getLink(), null); + itemURL = new MultiProtocolURI(item.getLink()); } catch (MalformedURLException e) { continue; } @@ -164,12 +164,12 @@ public class rssParser extends AbstractParser implements Idiom { feedSections.add(itemHeadline); } - final Map itemLinks = scraper.getAnchors(); + final Map itemLinks = scraper.getAnchors(); if (itemLinks != null && !itemLinks.isEmpty()) { anchors.putAll(itemLinks); } - final HashMap itemImages = scraper.getImages(); + final HashMap itemImages = scraper.getImages(); if (itemImages != null && !itemImages.isEmpty()) { ContentScraper.addAllImages(images, itemImages); } diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index adb5f6abc..ac2474f57 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -34,11 +34,11 @@ import java.util.Set; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; public class rtfParser extends AbstractParser implements Idiom { @@ -62,7 +62,7 @@ public class rtfParser extends AbstractParser implements Idiom { super("Rich Text Format Parser"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { diff --git 
a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 94c79de11..482bb38d4 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -36,12 +36,12 @@ import java.io.OutputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.TextParser; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -69,7 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { super("7zip Archive Parser"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException { final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, false); Handler archive; super.theLogger.logFine("opening 7zip archive..."); @@ -99,13 +99,13 @@ public class sevenzipParser extends AbstractParser implements Idiom { } @Override - public Document parse(final DigestURI location, final String mimeType, final String charset, + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final byte[] source) throws ParserException, InterruptedException { return parse(location, mimeType, charset, new ByteArrayIInStream(source)); } @Override - public Document parse(final DigestURI location, final String mimeType, final String charset, + public Document parse(final MultiProtocolURI location, final 
String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException { try { return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r")); @@ -114,7 +114,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { } } - public Document parse(final DigestURI location, final String mimeType, final String charset, + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); @@ -189,7 +189,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { Document theDoc; // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects - final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); + final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index c93e19fa4..050e2633f 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -33,11 +33,11 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import pt.tumba.parser.swf.SWF2HTML; @@ -74,7 +74,7 @@ public class swfParser extends AbstractParser implements Idiom { 
* parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document */ - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { final SWF2HTML swf2html = new SWF2HTML(); @@ -97,7 +97,7 @@ public class swfParser extends AbstractParser implements Idiom { final String[] sections = null; final String abstrct = null; //TreeSet images = null; - final HashMap anchors = new HashMap(); + final HashMap anchors = new HashMap(); int urls = 0; int urlStart = -1; int urlEnd = 0; @@ -114,7 +114,7 @@ public class swfParser extends AbstractParser implements Idiom { urlEnd = contents.indexOf(linebreak,urlStart); url = contents.substring(urlStart,urlEnd); urlnr = (Integer.valueOf(++urls)).toString(); - anchors.put(new DigestURI(url, null), urlnr); + anchors.put(new MultiProtocolURI(url), urlnr); contents = contents.substring(0,urlStart)+contents.substring(urlEnd); } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index f8f6cbb1a..fefb5e312 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.Set; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; @@ -45,7 +46,6 @@ import net.yacy.document.TextParser; import net.yacy.document.ParserException; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import 
net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; @@ -81,7 +81,7 @@ public class tarParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException { long docTextLength = 0; OutputStream docText = null; @@ -106,8 +106,8 @@ public class tarParser extends AbstractParser implements Idiom { final LinkedList docSections = new LinkedList(); final StringBuilder docAbstrct = new StringBuilder(); - final Map docAnchors = new HashMap(); - final HashMap docImages = new HashMap(); + final Map docAnchors = new HashMap(); + final HashMap docImages = new HashMap(); // looping through the contained files TarEntry entry; @@ -143,7 +143,7 @@ public class tarParser extends AbstractParser implements Idiom { checkInterruption(); // parsing the content - subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile); + subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile); } catch (final ParserException e) { this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. 
" + e.getMessage()); } finally { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 0f2b81ae8..7d3346e9c 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -36,12 +36,12 @@ import java.util.List; import java.util.Map; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.BDecoder; import net.yacy.kelondro.util.FileUtils; @@ -75,7 +75,7 @@ public class torrentParser extends AbstractParser implements Idiom { } @Override - public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { byte[] b = null; try { b = FileUtils.read(source); @@ -141,7 +141,7 @@ public class torrentParser extends AbstractParser implements Idiom { try { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); - Document d = parser.parse(new DigestURI("http://localhost/test.torrent", null), null, "utf-8", b); + Document d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", b); Condenser c = new Condenser(d, true, true); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 86567f820..06610ccf9 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ 
b/source/net/yacy/document/parser/vcfParser.java @@ -37,11 +37,11 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.order.Base64Order; /** @@ -80,13 +80,13 @@ public class vcfParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { final StringBuilder parsedTitle = new StringBuilder(); final StringBuilder parsedDataText = new StringBuilder(); final HashMap parsedData = new HashMap(); - final HashMap anchors = new HashMap(); + final HashMap anchors = new HashMap(); final LinkedList parsedNames = new LinkedList(); boolean useLastLine = false; @@ -195,7 +195,7 @@ public class vcfParser extends AbstractParser implements Idiom { parsedData.clear(); } else if (key.toUpperCase().startsWith("URL")) { try { - final DigestURI newURL = new DigestURI(value, null); + final MultiProtocolURI newURL = new MultiProtocolURI(value); anchors.put(newURL, newURL.toString()); //parsedData.put(key,value); } catch (final MalformedURLException ex) {/* ignore this */} diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 03f96d9ab..44b601394 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -31,11 +31,11 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import 
net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import org.apache.poi.hdgf.extractor.VisioTextExtractor; @@ -82,7 +82,7 @@ public class vsdParser extends AbstractParser implements Idiom { * parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document */ - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { Document theDoc = null; diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index bfbdc34ca..0667c2025 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -31,11 +31,11 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; @@ -76,7 +76,7 @@ public class xlsParser extends AbstractParser implements Idiom { * parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document */ - public Document parse(final DigestURI location, final String mimeType, + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, 
final InputStream source) throws ParserException, InterruptedException { return new XLSHSSFListener().parse(location, mimeType, charset, source); @@ -111,7 +111,7 @@ public class xlsParser extends AbstractParser implements Idiom { * parses the source documents and returns a Document containing * all extracted information about the parsed document */ - public Document parse(final DigestURI location, final String mimeType, + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 846161c25..f61e3cc0c 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -39,6 +39,7 @@ import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; @@ -46,7 +47,6 @@ import net.yacy.document.TextParser; import net.yacy.document.ParserException; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; @@ -82,7 +82,7 @@ public class zipParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } - public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { long docTextLength = 0; OutputStream docText = null; @@ -95,8 +95,8 @@ public class zipParser 
extends AbstractParser implements Idiom { final StringBuilder docLongTitle = new StringBuilder(); final LinkedList docSections = new LinkedList(); final StringBuilder docAbstrct = new StringBuilder(); - final Map docAnchors = new HashMap(); - final HashMap docImages = new HashMap(); + final Map docAnchors = new HashMap(); + final HashMap docImages = new HashMap(); // looping through the contained files ZipEntry entry; @@ -129,7 +129,7 @@ public class zipParser extends AbstractParser implements Idiom { FileUtils.copy(zippedContent,subDocTempFile,entry.getSize()); // parsing the zip file entry - subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile); + subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile); } catch (final ParserException e) { this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage()); } finally { diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index c3a4c4583..25db5e8d3 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -28,63 +28,24 @@ package net.yacy.kelondro.data.meta; // and to prevent that java.net.URL usage causes DNS queries which are used in java.net. 
import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; import java.net.MalformedURLException; -import java.net.UnknownHostException; -import java.text.Collator; -import java.util.Locale; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import jcifs.smb.SmbException; -import jcifs.smb.SmbFile; -import jcifs.smb.SmbFileInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.util.Domains; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.kelondro.util.Punycode; -import net.yacy.kelondro.util.Punycode.PunycodeException; -public class DigestURI implements Serializable { +public class DigestURI extends MultiProtocolURI implements Serializable { - private static final long serialVersionUID = -1173233022912141884L; + private static final long serialVersionUID = -1173233022912141885L; public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter - private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(? 
sessionIDnames; - static { - insensitiveCollator.setStrength(Collator.SECONDARY); - insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); - sessionIDnames = new TreeSet(insensitiveCollator); - } - - public static final void initSessionIDNames(File idNamesFile) { - for (String s: FileUtils.loadList(idNamesFile)) { - if (s == null) continue; - s = s.trim(); - if (s.length() > 0) sessionIDnames.add(s); - } - } // class variables - private String protocol, host, userInfo, path, quest, ref; private byte[] hash; - private int port; public static String domhash(final String host) { String h = host; @@ -108,640 +69,30 @@ public class DigestURI implements Serializable { } public DigestURI(final String url, final byte[] hash) throws MalformedURLException { - if (url == null) throw new MalformedURLException("url string is null"); - - parseURLString(url); + super(url); this.hash = hash; } - public static final boolean isHTTP(String s) { return s.startsWith("http://"); } - public static final boolean isHTTPS(String s) { return s.startsWith("https://"); } - public static final boolean isFTP(String s) { return s.startsWith("ftp://"); } - public static final boolean isFile(String s) { return s.startsWith("file://"); } - public static final boolean isSMB(String s) { return s.startsWith("smb://") || s.startsWith("\\\\"); } - - public final boolean isHTTP() { return this.protocol.equals("http"); } - public final boolean isHTTPS() { return this.protocol.equals("https"); } - public final boolean isFTP() { return this.protocol.equals("ftp"); } - public final boolean isFile() { return this.protocol.equals("file"); } - public final boolean isSMB() { return this.protocol.equals("smb"); } - - private void parseURLString(String url) throws MalformedURLException { - // identify protocol - assert (url != null); - url = url.trim(); - if (url.startsWith("\\\\")) { - url = "smb://" + url.substring(2).replaceAll("\\\\", "/"); - } - - if (url.length() > 1 && url.charAt(1) == ':') { - // 
maybe a DOS drive path - url = "file://" + url; - } - - if (url.length() > 0 && url.charAt(0) == '/') { - // maybe a unix/linux absolute path - url = "file://" + url; - } - - int p = url.indexOf(':'); - if (p < 0) { - url = "http://" + url; - p = 4; - } - this.protocol = url.substring(0, p).toLowerCase().trim(); - if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'"); - if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) { - // identify host, userInfo and file for http and ftp protocol - final int q = url.indexOf('/', p + 3); - int r; - if (q < 0) { - if ((r = url.indexOf('@', p + 3)) < 0) { - host = url.substring(p + 3); - userInfo = null; - } else { - host = url.substring(r + 1); - userInfo = url.substring(p + 3, r); - } - path = "/"; - } else { - host = url.substring(p + 3, q).trim(); - if ((r = host.indexOf('@')) < 0) { - userInfo = null; - } else { - userInfo = host.substring(0, r); - host = host.substring(r + 1); - } - path = url.substring(q); - } - if (host.length() < 4 && !protocol.equals("file")) throw new MalformedURLException("host too short: '" + host + "'"); - if (host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host"); - path = resolveBackpath(path); - identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 
445 : -1))))); - identRef(); - identQuest(); - escape(); - } else { - // this is not a http or ftp url - if (protocol.equals("mailto")) { - // parse email url - final int q = url.indexOf('@', p + 3); - if (q < 0) { - throw new MalformedURLException("wrong email address: " + url); - } - userInfo = url.substring(p + 1, q); - host = url.substring(q + 1); - path = null; - port = -1; - quest = null; - ref = null; - } if (protocol.equals("file")) { - // parse file url - String h = url.substring(p + 1); - if (h.startsWith("//")) { - // host may be given, but may be also empty - final int q = h.indexOf('/', 2); - if (q <= 0) { - // no host given - host = null; - path = h.substring(2); - } else { - host = h.substring(2, q); - if (host.length() == 0 || host.equals("localhost")) host = null; - h = h.substring(q); - char c = h.charAt(2); - if (c == ':' || c == '|') - path = h.substring(1); - else - path = h; - } - } else { - host = null; - if (h.length() > 0 && h.charAt(0) == '/') { - char c = h.charAt(2); - if (c == ':' || c == '|') - path = h.substring(1); - else - path = h; - } else { - char c = h.charAt(1); - if (c == ':' || c == '|') - path = h; - else - path = "/" + h; - } - } - userInfo = null; - port = -1; - quest = null; - ref = null; - } else { - throw new MalformedURLException("unknown protocol: " + url); - } - } - - // handle international domains - if (!Punycode.isBasic(host)) try { - final String[] domainParts = patternDot.split(host, 0); - StringBuilder buffer = new StringBuilder(); - // encode each domain-part separately - for(int i=0; i 0 && relPath.charAt(0) == '/') { - this.path = relPath; - } else if (baseURL.path.endsWith("/")) { - if (relPath.length() > 0 && (relPath.charAt(0) == '#' || relPath.charAt(0) == '?')) { - throw new MalformedURLException("relative path malformed: " + relPath); - } - this.path = baseURL.path + relPath; - } else { - if (relPath.length() > 0 && (relPath.charAt(0) == '#' || relPath.charAt(0) == '?')) { - this.path = baseURL.path + 
relPath; - } else { - final int q = baseURL.path.lastIndexOf('/'); - if (q < 0) { - this.path = relPath; - } else { - this.path = baseURL.path.substring(0, q + 1) + relPath; - } - } - } - this.quest = baseURL.quest; - this.ref = baseURL.ref; - - path = resolveBackpath(path); - identRef(); - identQuest(); - escape(); } public DigestURI(final String protocol, final String host, final int port, final String path) throws MalformedURLException { - if (protocol == null) throw new MalformedURLException("protocol is null"); - this.protocol = protocol; - this.host = host; - this.port = port; - this.path = path; + super(protocol, host, port, path); this.hash = null; - identRef(); - identQuest(); - escape(); } - // resolve '..' - public String resolveBackpath(final String path) { - String p = path; - - /* by [MT] */ - if (p.length() == 0 || p.charAt(0) != '/') { p = "/" + p; } - - final Matcher matcher = backPathPattern.matcher(p); - while (matcher.find()) { - p = matcher.replaceAll(""); - matcher.reset(p); - } - - return p.equals("")?"/":p; - } - - /** - * Escapes the following parts of the url, this object already contains: - *
    - *
  • path: see {@link #escape(String)}
  • - *
  • ref: same as above
  • - *
  • quest: same as above without the ampersand ("&") and the equals symbol
  • - *
- */ - private void escape() { - if (path != null && path.indexOf('%') == -1) escapePath(); - if (quest != null && quest.indexOf('%') == -1) escapeQuest(); - if (ref != null && ref.indexOf('%') == -1) escapeRef(); - } - - private void escapePath() { - final String[] pathp = patternSlash.split(path, -1); - StringBuilder ptmp = new StringBuilder(path.length() + 10); - for (int i = 0; i < pathp.length; i++) { - ptmp.append('/'); - ptmp.append(escape(pathp[i])); - } - path = ptmp.substring((ptmp.length() > 0) ? 1 : 0); - } - - private void escapeRef() { - ref = escape(ref).toString(); - } - - private void escapeQuest() { - final String[] questp = patternAmp.split(quest, -1); - StringBuilder qtmp = new StringBuilder(quest.length() + 10); - for (int i = 0; i < questp.length; i++) { - if (questp[i].indexOf('=') != -1) { - qtmp.append('&'); - qtmp.append(escape(questp[i].substring(0, questp[i].indexOf('=')))); - qtmp.append('='); - qtmp.append(escape(questp[i].substring(questp[i].indexOf('=') + 1))); - } else { - qtmp.append('&'); - qtmp.append(escape(questp[i])); - } - } - quest = qtmp.substring((qtmp.length() > 0) ? 
1 : 0); - } - - private final static String[] hex = { - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", - "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F", - "%30", "%31", "%32", "%33", "%34", "%35", "%36", "%37", - "%38", "%39", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", - "%40", "%41", "%42", "%43", "%44", "%45", "%46", "%47", - "%48", "%49", "%4A", "%4B", "%4C", "%4D", "%4E", "%4F", - "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57", - "%58", "%59", "%5A", "%5B", "%5C", "%5D", "%5E", "%5F", - "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67", - "%68", "%69", "%6A", "%6B", "%6C", "%6D", "%6E", "%6F", - "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77", - "%78", "%79", "%7A", "%7B", "%7C", "%7D", "%7E", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF" - }; - - /** - * Encode a string to the "x-www-form-urlencoded" form, enhanced - * with 
the UTF-8-in-URL proposal. This is what happens: - * - *
    - *
  • The ASCII characters 'a' through 'z', 'A' through 'Z', - * and '0' through '9' remain the same. - * - *
  • The unreserved characters - _ . ! ~ * ' ( ) remain the same. - * - *
  • All other ASCII characters are converted into the - * 3-character string "%xy", where xy is - * the two-digit hexadecimal representation of the character - * code - * - *
  • All non-ASCII characters are encoded in two steps: first - * to a sequence of 2 or 3 bytes, using the UTF-8 algorithm; - * secondly each of these bytes is encoded as "%xx". - *
- * - * @param s The string to be encoded - * @return The encoded string - */ - // from: http://www.w3.org/International/URLUTF8Encoder.java - public static StringBuilder escape(final String s) { - final int len = s.length(); - final StringBuilder sbuf = new StringBuilder(len + 10); - for (int i = 0; i < len; i++) { - final int ch = s.charAt(i); - if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' - sbuf.append((char)ch); - } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' - sbuf.append((char)ch); - } else if ('0' <= ch && ch <= '9') { // '0'..'9' - sbuf.append((char)ch); - } else if (ch == ' ') { // space - sbuf.append("%20"); - } else if (ch == '&' || ch == ':' // unreserved - || ch == '-' || ch == '_' - || ch == '.' || ch == '!' - || ch == '~' || ch == '*' - || ch == '\'' || ch == '(' - || ch == ')' || ch == ';') { - sbuf.append((char)ch); - } else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced - sbuf.append((char)ch); - } else if (ch <= 0x007f) { // other ASCII - sbuf.append(hex[ch]); - } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF - sbuf.append(hex[0xc0 | (ch >> 6)]); - sbuf.append(hex[0x80 | (ch & 0x3F)]); - } else { // 0x7FF < ch <= 0xFFFF - sbuf.append(hex[0xe0 | (ch >> 12)]); - sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]); - sbuf.append(hex[0x80 | (ch & 0x3F)]); - } - } - return sbuf; - } - - // from: http://www.w3.org/International/unescape.java - public static String unescape(final String s) { - final int l = s.length(); - final StringBuilder sbuf = new StringBuilder(l); - int ch = -1; - int b, sumb = 0; - for (int i = 0, more = -1; i < l; i++) { - /* Get next byte b from URL segment s */ - switch (ch = s.charAt(i)) { - case '%': - if (i + 2 < l) { - ch = s.charAt(++i); - int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; - ch = s.charAt(++i); - int lb = (Character.isDigit ((char) ch) ? 
ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; - b = (hb << 4) | lb; - } else { - b = ch; - } - break; - case '+': - b = ' '; - break; - default: - b = ch; - } - /* Decode byte b as UTF-8, sumb collects incomplete chars */ - if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte) - sumb = (sumb << 6) | (b & 0x3f); // Add 6 bits to sumb - if (--more == 0) sbuf.append((char) sumb); // Add char to sbuf - } else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits) - sbuf.append((char) b); // Store in sbuf - } else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits) - sumb = b & 0x1f; - more = 1; // Expect 1 more byte - } else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits) - sumb = b & 0x0f; - more = 2; // Expect 2 more bytes - } else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits) - sumb = b & 0x07; - more = 3; // Expect 3 more bytes - } else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits) - sumb = b & 0x03; - more = 4; // Expect 4 more bytes - } else /*if ((b & 0xfe) == 0xfc)*/ { // 1111110x (yields 1 bit) - sumb = b & 0x01; - more = 5; // Expect 5 more bytes - } - /* We don't test if the UTF-8 encoding is well-formed */ - } - return sbuf.toString(); - } - - private void identPort(final String inputURL, final int dflt) throws MalformedURLException { - // identify ref in file - final int r = this.host.indexOf(':'); - if (r < 0) { - this.port = dflt; - } else { - try { - final String portStr = this.host.substring(r + 1); - if (portStr.trim().length() > 0) this.port = Integer.parseInt(portStr); - else this.port = -1; - this.host = this.host.substring(0, r); - } catch (final NumberFormatException e) { - throw new MalformedURLException("wrong port in host fragment '" + this.host + "' of input url '" + inputURL + "'"); - } - } - } - - private void identRef() { - // identify ref in file - final int r = path.indexOf('#'); - if (r < 0) { - this.ref = null; - } else { - this.ref = path.substring(r + 1); - this.path = path.substring(0, 
r); - } - } - - private void identQuest() { - // identify quest in file - final int r = path.indexOf('?'); - if (r < 0) { - this.quest = null; - } else { - this.quest = path.substring(r + 1); - this.path = path.substring(0, r); - } - } - - public String getFile() { - return getFile(false, false); - } - - public String getFile(final boolean excludeReference, final boolean removeSessionID) { - // this is the path plus quest plus ref - // if there is no quest and no ref the result is identical to getPath - // this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile() - if (quest == null) return (excludeReference || ref == null) ? path : path + "#" + ref; - String q = quest; - if (removeSessionID) { - for (String sid: sessionIDnames) { - if (q.toLowerCase().startsWith(sid.toLowerCase() + "=")) { - int p = q.indexOf('&'); - if (p < 0) return (excludeReference || ref == null) ? path : path + "#" + ref; - q = q.substring(p + 1); - continue; - } - int p = q.toLowerCase().indexOf("&" + sid.toLowerCase() + "="); - if (p < 0) continue; - int p1 = q.indexOf('&', p); - if (p1 < 0) { - q = q.substring(0, p); - } else { - q = q.substring(0, p) + q.substring(p1); - } - } - } - return (excludeReference || ref == null) ? path + "?" + q : path + "?" 
+ q + "#" + ref; - } - - public String getFileName() { - // this is a method not defined in any sun api - // it returns the last portion of a path without any reference - final int p = path.lastIndexOf('/'); - if (p < 0) return path; - if (p == path.length() - 1) return ""; // no file name, this is a path to a directory - return path.substring(p + 1); // the 'real' file name - } - - public String getFileExtension() { - String name = getFileName(); - int p = name.lastIndexOf('.'); - if (p < 0) return ""; - return name.substring(p + 1); - } - - public String getPath() { - return path; - } - - /** - * return the file object to a local file - * this patches also 'strange' windows file paths - * @return the file as absolute path - */ - public File getLocalFile() { - char c = path.charAt(1); - if (c == ':') return new File(path.replace('/', '\\')); - if (c == '|') return new File(path.charAt(0) + ":" + path.substring(2).replace('/', '\\')); - c = path.charAt(2); - if (c == ':' || c == '|') return new File(path.charAt(1) + ":" + path.substring(3).replace('/', '\\')); - return new File(path); - } - - public String getAuthority() { - return ((port >= 0) && (host != null)) ? host + ":" + port : ((host != null) ? 
host : ""); - } - - public String getHost() { - return host; - } - - public int getPort() { - return port; - } - - public String getProtocol() { - return protocol; - } - - public String getRef() { - return ref; - } - - public void removeRef() { - ref = null; - } - - public String getUserInfo() { - return userInfo; - } - - public String getQuery() { - return quest; - } - - @Override - public String toString() { - return toNormalform(false, true); - } - - public String toNormalform(final boolean excludeReference, final boolean stripAmp) { - return toNormalform(excludeReference, stripAmp, false); - } - - public String toNormalform(final boolean excludeReference, final boolean stripAmp, final boolean removeSessionID) { - String result = toNormalform0(excludeReference, removeSessionID); - if (stripAmp) { - result = result.replaceAll("&", "&"); - } - return result; - } - - private String toNormalform0(final boolean excludeReference, final boolean removeSessionID) { - // generates a normal form of the URL - boolean defaultPort = false; - if (this.protocol.equals("mailto")) { - return this.protocol + ":" + this.userInfo + "@" + this.host; - } else if (isHTTP()) { - if (this.port < 0 || this.port == 80) { defaultPort = true; } - } else if (isHTTPS()) { - if (this.port < 0 || this.port == 443) { defaultPort = true; } - } else if (isFTP()) { - if (this.port < 0 || this.port == 21) { defaultPort = true; } - } else if (isSMB()) { - if (this.port < 0 || this.port == 445) { defaultPort = true; } - } else if (isFile()) { - defaultPort = true; - } - final String urlPath = this.getFile(excludeReference, removeSessionID); - - if (defaultPort) { - return - this.protocol + "://" + - ((this.getHost() == null) ? "" : ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + this.getHost().toLowerCase()) + - urlPath; - } - return this.protocol + "://" + - ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + - this.getHost().toLowerCase() + ((defaultPort) ? 
("") : (":" + this.port)) + urlPath; - } - /* (non-Javadoc) * @see java.lang.Object#hashCode() */ @@ -750,63 +101,6 @@ public class DigestURI implements Serializable { return ByteArray.hashCode(this.hash()); } - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(final Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (!(obj instanceof DigestURI)) return false; - DigestURI other = (DigestURI) obj; - return this.toString().equals(other.toString()); - } - - public int compareTo(final Object h) { - assert (h instanceof DigestURI); - return this.toString().compareTo(((DigestURI) h).toString()); - } - - public boolean isPOST() { - return (this.quest != null) && (this.quest.length() > 0); - } - - public final boolean isCGI() { - final String ls = unescape(path.toLowerCase()); - return ls.indexOf(".cgi") >= 0 || - ls.indexOf(".exe") >= 0; - } - - public final boolean isIndividual() { - final String q = unescape(path.toLowerCase()); - for (String sid: sessionIDnames) { - if (q.startsWith(sid.toLowerCase() + "=")) return true; - int p = q.indexOf("&" + sid.toLowerCase() + "="); - if (p >= 0) return true; - } - int pos; - return - ((pos = q.indexOf("sid")) > 0 && - (q.charAt(--pos) == '?' 
|| q.charAt(pos) == '&' || q.charAt(pos) == ';') && - (pos += 5) < q.length() && - (q.charAt(pos) != '&' && q.charAt(--pos) == '=') - ) || - - ((pos = q.indexOf("sessionid")) > 0 && - (pos += 10) < q.length() && - (q.charAt(pos) != '&' && - (q.charAt(--pos) == '=' || q.charAt(pos) == '/')) - ) || - - ((pos = q.indexOf("phpsessid")) > 0 && - (pos += 10) < q.length() && - (q.charAt(pos) != '&' && - (q.charAt(--pos) == '=' || q.charAt(pos) == '/'))); - } - - - // static methods from plasmaURL - public static final int flagTypeID(final String hash) { return (Base64Order.enhancedCoder.decodeByte(hash.charAt(11)) & 32) >> 5; } @@ -833,7 +127,7 @@ public class DigestURI implements Serializable { assert this.hash == null; // should only be called if the hash was not computed before - final int id = Domains.getDomainID(this.host); // id=7: tld is local + final int id = Domains.getDomainID(host); // id=7: tld is local final boolean isHTTP = isHTTP(); int p = (host == null) ? -1 : this.host.lastIndexOf('.'); String dom = (p > 0) ? 
dom = host.substring(0, p) : ""; @@ -980,7 +274,7 @@ public class DigestURI implements Serializable { // checks for local/global IP range and local IP public final boolean isLocal() { if (this.hash == null) { - if (this.host.startsWith("127.") || this.host.equals("localhost") || this.host.startsWith("0:0:0:0:0:0:0:1")) return true; + if (super.isLocal()) return true; synchronized (this) { if (this.hash == null) this.hash = urlHashComputation(); } @@ -993,318 +287,4 @@ public class DigestURI implements Serializable { return domDomain(urlhash) == 7; } - // language calculation - public final String language() { - String language = "en"; - if (host == null) return language; - final int pos = host.lastIndexOf('.'); - if (pos > 0 && host.length() - pos == 3) language = host.substring(pos + 1).toLowerCase(); - if (language.equals("uk")) language = "en"; - return language; - } - - // The DigestURI may be used to integrate File- and SMB accessed into one object - // some extraction methods that generate File/SmbFile objects from the DigestURI - - /** - * create a standard java URL. - * Please call isHTTP(), isHTTPS() and isFTP() before using this class - */ - public java.net.URL getURL() { - if (!(isHTTP() || isHTTPS() || isFTP())) throw new UnsupportedOperationException(); - try { - return new java.net.URL(this.toNormalform(false, true)); - } catch (MalformedURLException e) { - // this should never happen because this class returns proper url objects - Log.logException(e); - return null; - } - } - - /** - * create a standard java File. 
- * Please call isFile() before using this class - */ - public java.io.File getFSFile() { - if (!isFile()) throw new UnsupportedOperationException(); - return new java.io.File(this.toNormalform(false, true).substring(7)); - } - - /** - * create a smb File - * Please call isSMB() before using this class - * @throws MalformedURLException - */ - public SmbFile getSmbFile() throws MalformedURLException { - if (!isSMB()) throw new UnsupportedOperationException(); - String url = this.toNormalform(false, true); - return new SmbFile(url); - } - - // some methods that let the DigestURI look like a java.io.File object - // to use these methods the object must be either of type isFile() or isSMB() - - public boolean exists() { - if (isFile()) return getFSFile().exists(); - if (isSMB()) try { - return getSmbFile().exists(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.exists SmbException for " + this.toString() + ": " + e.getMessage()); - return false; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.exists MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return false; - } - return false; - } - - public boolean canRead() { - if (isFile()) return getFSFile().canRead(); - if (isSMB()) try { - return getSmbFile().canRead(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.canRead SmbException for " + this.toString() + ": " + e.getMessage()); - return false; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.canRead MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return false; - } - return false; - } - - public boolean canWrite() { - if (isFile()) return getFSFile().canWrite(); - if (isSMB()) try { - return getSmbFile().canWrite(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.canWrite SmbException for " + this.toString() + ": " + e.getMessage()); - return false; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", 
"SMB.canWrite MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return false; - } - return false; - } - - // commented out since the canExecute() method is not part of java 1.5 -// public boolean canExecute() { -// if (isFile()) return getFSFile().canExecute(); -// if (isSMB()) return false; // no execute over smb -// return false; -// } - - public boolean isHidden() { - if (isFile()) return getFSFile().isHidden(); - if (isSMB()) try { - return getSmbFile().isHidden(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.isHidden SmbException for " + this.toString() + ": " + e.getMessage()); - return false; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.isHidden MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return false; - } - return false; - } - - public boolean isDirectory() { - if (isFile()) return getFSFile().isDirectory(); - if (isSMB()) try { - return getSmbFile().isDirectory(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.isDirectory SmbException for " + this.toString() + ": " + e.getMessage()); - return false; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.isDirectory MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return false; - } - return false; - } - - public long length() { - if (isFile()) return getFSFile().length(); - if (isSMB()) try { - return getSmbFile().length(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.length SmbException for " + this.toString() + ": " + e.getMessage()); - return 0; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.length MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return 0; - } - return 0; - } - - public long lastModified() { - if (isFile()) return getFSFile().lastModified(); - if (isSMB()) try { - return getSmbFile().lastModified(); - } catch (SmbException e) { - Log.logWarning("DigestURI", 
"SMB.lastModified SmbException for " + this.toString() + ": " + e.getMessage()); - return 0; - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.lastModified MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return 0; - } - return 0; - } - - public String getName() { - if (isFile()) return getFSFile().getName(); - if (isSMB()) try { - return getSmbFile().getName(); - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.getName MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return null; - } - return null; - } - - public String[] list() { - if (isFile()) return getFSFile().list(); - if (isSMB()) try { - SmbFile sf = getSmbFile(); - try { - return sf.list(); - } catch (SmbException e) { - Log.logWarning("DigestURI", "SMB.list SmbException for " + sf.toString() + ": " + e.getMessage()); - return null; - } - } catch (MalformedURLException e) { - Log.logWarning("DigestURI", "SMB.list MalformedURLException for " + this.toString() + ": " + e.getMessage()); - return null; - } - return null; - } - - public InputStream getInputStream() throws IOException { - if (isFile()) return new MultiProtocolInputStream(getFSFile()); - if (isSMB()) return new MultiProtocolInputStream(getSmbFile()); - return null; - } - - public class MultiProtocolInputStream extends InputStream { - private InputStream is; - - public MultiProtocolInputStream(File jf) throws IOException { - this.is = new FileInputStream(jf); - } - - public MultiProtocolInputStream(SmbFile sf) throws IOException { - /* TODO: Forward Exception instead of String containing message once - * YaCy is ported to Java 1.6 some day. 
*/ - try { - this.is = new SmbFileInputStream(sf); - } catch (SmbException e) { - throw new IOException(e.getMessage()); - } catch (MalformedURLException e) { - throw new IOException(e.getMessage()); - } catch (UnknownHostException e) { - throw new IOException(e.getMessage()); - } - } - - @Override - public int read() throws IOException { - return this.is.read(); - } - - } - - //--------------------- - - private static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; - public static final Pattern splitpattern = Pattern.compile(splitrex); - public static String[] urlComps(String normalizedURL) { - final int p = normalizedURL.indexOf("//"); - if (p > 0) normalizedURL = normalizedURL.substring(p + 2); - return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url - } - - public static void main(final String[] args) { - final String[][] test = new String[][]{ - new String[]{null, "C:WINDOWS\\CMD0.EXE"}, - new String[]{null, "file://C:WINDOWS\\CMD0.EXE"}, - new String[]{null, "file:/bin/yacy1"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' - new String[]{null, "file:///bin/yacy2"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' - new String[]{null, "file:C:WINDOWS\\CMD.EXE"}, - new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"}, - new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"}, - new String[]{null, "http://www.anomic.de/test/"}, - new String[]{null, "http://www.anomic.de/"}, - new String[]{null, "http://www.anomic.de"}, - new String[]{null, "http://www.anomic.de/home/test?x=1#home"}, - new String[]{null, "http://www.anomic.de/home/test?x=1"}, - new String[]{null, "http://www.anomic.de/home/test#home"}, - new String[]{null, "ftp://ftp.anomic.de/home/test#home"}, - new String[]{null, "http://www.anomic.de/home/../abc/"}, - new String[]{null, "mailto:abcdefg@nomailnomail.com"}, - new String[]{"http://www.anomic.de/home", "test"}, - new 
String[]{"http://www.anomic.de/home", "test/"}, - new String[]{"http://www.anomic.de/home/", "test"}, - new String[]{"http://www.anomic.de/home/", "test/"}, - new String[]{"http://www.anomic.de/home/index.html", "test.htm"}, - new String[]{"http://www.anomic.de/home/index.html", "http://www.yacy.net/test"}, - new String[]{"http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test"}, - new String[]{"http://www.anomic.de/home/index.html", "../test"}, - new String[]{"http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com"}, - new String[]{null, "news:de.test"}, - new String[]{"http://www.anomic.de/home", "news:de.test"}, - new String[]{null, "mailto:bob@web.com"}, - new String[]{"http://www.anomic.de/home", "mailto:bob@web.com"}, - new String[]{"http://www.anomic.de/home", "ftp://ftp.anomic.de/src"}, - new String[]{null, "ftp://ftp.delegate.org/"}, - new String[]{"http://www.anomic.de/home", "ftp://ftp.delegate.org/"}, - new String[]{"http://www.anomic.de","mailto:yacy@weltherrschaft.org"}, - new String[]{"http://www.anomic.de","javascipt:temp"}, - new String[]{null,"http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history"}, - new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, - new String[]{null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585"}, - new String[]{null, "http://www.scc.kit.edu/publikationen/80.php?PHPSESSID=5f3624d3e1c33d4c086ab600d4d5f5a1"}, - new String[]{null, "smb://localhost/"}, - new String[]{null, "smb://localhost/repository"}, // paths must end with '/' - new String[]{null, "smb://localhost/repository/"}, - new String[]{null, "\\\\localhost\\"}, // Windows-like notion of smb shares - new String[]{null, "\\\\localhost\\repository"}, - new String[]{null, "\\\\localhost\\repository\\"} - }; - DigestURI.initSessionIDNames(new File("defaults/sessionid.names")); - String environment, url; - DigestURI aURL, aURL1; 
- java.net.URL jURL; - for (int i = 0; i < test.length; i++) { - environment = test[i][0]; - url = test[i][1]; - try {aURL = DigestURI.newURL(environment, url);} catch (final MalformedURLException e) {Log.logException(e); aURL = null;} - if (aURL != null) System.out.println("normalized: " + aURL.toNormalform(true, true, true) + " - hash=" + new String(aURL.hash())); - if (environment == null) { - try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;} - } else { - try {jURL = new java.net.URL(new java.net.URL(environment), url);} catch (final MalformedURLException e) {jURL = null;} - } - - // check equality to java.net.URL - if (((aURL == null) && (jURL != null)) || - ((aURL != null) && (jURL == null)) || - ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) { - System.out.println("Difference for environment=" + environment + ", url=" + url + ":"); - System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString()); - System.out.println((aURL == null) ? 
"aURL rejected input" : "aURL=" + aURL.toString()); - } - - // check stability: the normalform of the normalform must be equal to the normalform - if (aURL != null) try { - aURL1 = new DigestURI(aURL.toNormalform(false, true), null); - if (!(aURL1.toNormalform(false, true).equals(aURL.toNormalform(false, true)))) { - System.out.println("no stability for url:"); - System.out.println("aURL0=" + aURL.toString()); - System.out.println("aURL1=" + aURL1.toString()); - } - } catch (final MalformedURLException e) { - System.out.println("no stability for url:"); - System.out.println("aURL0=" + aURL.toString()); - System.out.println("aURL1 cannot be computed:" + e.getMessage()); - } - } - } } diff --git a/source/net/yacy/kelondro/util/Domains.java b/source/net/yacy/kelondro/util/Domains.java index b122c888f..b859f0a68 100644 --- a/source/net/yacy/kelondro/util/Domains.java +++ b/source/net/yacy/kelondro/util/Domains.java @@ -595,7 +595,7 @@ public class Domains { } public static boolean isLocal(final String host) { - assert (host != null); + if (host == null) return true; // FIXME IPv4 only // check local ip addresses diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index bf1b8466a..0e49c0919 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -283,7 +283,7 @@ public class Blacklist { } public boolean isListed(final String blacklistType, final DigestURI url) { - + if (url.getHost() == null) return false; final HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType); if (!urlHashCache.has(url.hash())) { final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile()); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index a33a6e817..77730fd32 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -51,6 +51,7 @@ import 
net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.FTPLoader; +import de.anomic.crawler.retrieval.FileLoader; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; @@ -73,17 +74,19 @@ public final class LoaderDispatcher { private final HTTPLoader httpLoader; private final FTPLoader ftpLoader; private final SMBLoader smbLoader; + private final FileLoader fileLoader; private final Log log; public LoaderDispatcher(final Switchboard sb) { this.sb = sb; - this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp","smb"})); + this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp","smb","file"})); // initiate loader objects this.log = new Log("LOADER"); httpLoader = new HTTPLoader(sb, log); ftpLoader = new FTPLoader(sb, log); smbLoader = new SMBLoader(sb, log); + fileLoader = new FileLoader(sb, log); } public boolean isSupportedProtocol(final String protocol) { @@ -251,13 +254,14 @@ public final class LoaderDispatcher { } // now it's for sure that we will access the target. Remember the access time - accessTime.put(host, System.currentTimeMillis()); + if (host != null) accessTime.put(host, System.currentTimeMillis()); // load resource from the internet Response response = null; if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize); if (protocol.equals("ftp")) response = ftpLoader.load(request, true); if (protocol.equals("smb")) response = smbLoader.load(request, true); + if (protocol.equals("file")) response = fileLoader.load(request, true); if (response != null) { // we got something. 
Now check if we want to store that to the cache // first check looks if we want to store the content to the cache diff --git a/test/de/anomic/yacy/yacyURLTest.java b/test/de/anomic/yacy/yacyURLTest.java index 3f16cc1d8..66d3995c4 100644 --- a/test/de/anomic/yacy/yacyURLTest.java +++ b/test/de/anomic/yacy/yacyURLTest.java @@ -2,13 +2,14 @@ package de.anomic.yacy; import java.net.MalformedURLException; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.data.meta.DigestURI; import junit.framework.TestCase; public class yacyURLTest extends TestCase { - public void testResolveBackpath() throws MalformedURLException { + public void testResolveBackpath() { String[][] testStrings = new String[][] { new String[]{"/..home","/..home"}, new String[]{"/test/..home/test.html","/test/..home/test.html"}, @@ -23,14 +24,13 @@ public class yacyURLTest extends TestCase { new String[]{"/home/..test/../hallo/../","/home/"} }; - DigestURI urlObj = new DigestURI("http://yacy.net"); for (int i=0; i < testStrings.length; i++) { // desired conversion result System.out.print("testResolveBackpath: " + testStrings[i][0]); String shouldBe = testStrings[i][1]; // conversion result - String resolvedURL = urlObj.resolveBackpath(testStrings[i][0]); + String resolvedURL = MultiProtocolURI.resolveBackpath(testStrings[i][0]); // test if equal assertEquals(shouldBe,resolvedURL);