mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
- added new protocol loader for 'file'-type URLs
- it is now possible to crawl the local file system with an intranet peer - redesign of URL handling - refactoring: created LGPLed package cora: 'content retrieval api' which may be used externally by other applications without yacy core elements because it has no dependencies to other parts of yacy git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6902 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
parent
2fd795207c
commit
11639aef35
|
@ -685,7 +685,10 @@ crawler.http.maxFileSize=1048576
|
|||
crawler.ftp.maxFileSize=1048576
|
||||
|
||||
# smb crawler specific settings: maximum size
|
||||
crawler.smb.maxFileSize=50000000
|
||||
crawler.smb.maxFileSize=100000000
|
||||
|
||||
# file crawler specific settings: maximum size
|
||||
crawler.file.maxFileSize=100000000
|
||||
|
||||
# maximum number of crawler threads
|
||||
crawler.MaxActiveThreads = 200
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
import java.util.Random;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.kelondro.util.Domains;
|
||||
|
||||
import de.anomic.crawler.ResultImages;
|
||||
|
@ -90,8 +90,8 @@ public class Collage {
|
|||
final int yOffset = embed ? 0 : 70;
|
||||
for (int i = 0; i < fifoSize; i++) {
|
||||
|
||||
final DigestURI baseURL = origins[i].baseURL;
|
||||
final DigestURI imageURL = origins[i].imageEntry.url();
|
||||
final MultiProtocolURI baseURL = origins[i].baseURL;
|
||||
final MultiProtocolURI imageURL = origins[i].imageEntry.url();
|
||||
|
||||
// check if this loads a page from localhost, which must be prevented to protect the server
|
||||
// against attacks to the administration interface when localhost access is granted
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.util.Set;
|
|||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.TransformerWriter;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
@ -234,7 +235,7 @@ public class Crawler_p {
|
|||
// stack url
|
||||
sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
|
||||
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
|
||||
crawlingStartURL.getHost(),
|
||||
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
|
||||
crawlingStartURL,
|
||||
newcrawlingMustMatch,
|
||||
newcrawlingMustNotMatch,
|
||||
|
@ -345,7 +346,7 @@ public class Crawler_p {
|
|||
writer.close();
|
||||
|
||||
//String headline = scraper.getHeadline();
|
||||
final Map<DigestURI, String> hyperlinks = scraper.getAnchors();
|
||||
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
|
||||
|
||||
// creating a crawler profile
|
||||
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
|
||||
|
@ -370,11 +371,12 @@ public class Crawler_p {
|
|||
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
|
||||
|
||||
// loop through the contained links
|
||||
final Iterator<Map.Entry<DigestURI, String>> linkiterator = hyperlinks.entrySet().iterator();
|
||||
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
|
||||
DigestURI nexturl;
|
||||
while (linkiterator.hasNext()) {
|
||||
final Map.Entry<DigestURI, String> e = linkiterator.next();
|
||||
nexturl = e.getKey();
|
||||
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
|
||||
if (e.getKey() == null) continue;
|
||||
nexturl = new DigestURI(e.getKey());
|
||||
if (nexturl == null) continue;
|
||||
|
||||
// enqueuing the url for crawling
|
||||
|
|
|
@ -25,9 +25,9 @@
|
|||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.document.parser.xml.RSSReader;
|
||||
import net.yacy.cora.document.Hit;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
|
@ -69,7 +69,7 @@ public class FeedReader_p {
|
|||
prop.putHTML("page_description", feed.getChannel().getDescription());
|
||||
|
||||
int i = 0;
|
||||
for (final RSSMessage item: feed) {
|
||||
for (final Hit item: feed) {
|
||||
prop.putHTML("page_items_" + i + "_author", item.getAuthor());
|
||||
prop.putHTML("page_items_" + i + "_title", item.getTitle());
|
||||
prop.putHTML("page_items_" + i + "_link", item.getLink());
|
||||
|
|
|
@ -159,23 +159,27 @@
|
|||
<td colspan="2"><strong>http Crawler Settings:</strong></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maximum Filesize:</td>
|
||||
<td>Maximum HTTP Filesize:</td>
|
||||
<td class="settingsValue">#[crawler.http.maxFileSize]#</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><strong>ftp Crawler Settings:</strong></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maximum Filesize:</td>
|
||||
<td>Maximum FTP Filesize:</td>
|
||||
<td class="settingsValue">#[crawler.ftp.maxFileSize]#</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2"><strong>smb Crawler Settings:</strong></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maximum Filesize:</td>
|
||||
<td>Maximum SMB Filesize:</td>
|
||||
<td class="settingsValue">#[crawler.smb.maxFileSize]#</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Maximum file Filesize:</td>
|
||||
<td class="settingsValue">#[crawler.file.maxFileSize]#</td>
|
||||
</tr>
|
||||
</table>
|
||||
::<!-- 29: Crawler settings timeout error -->
|
||||
<p class="error">Invalid crawler timeout value: <tt>#[crawler.clientTimeout]#</tt></p>
|
||||
|
|
|
@ -503,18 +503,32 @@ public class SettingsAck_p {
|
|||
long maxSmbSize;
|
||||
try {
|
||||
maxSmbSize = Integer.parseInt(maxSizeStr);
|
||||
env.setConfig("crawler.smb.maxFileSize", Long.toString(maxFtpSize));
|
||||
env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize));
|
||||
} catch (final NumberFormatException e) {
|
||||
prop.put("info", "31");
|
||||
prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize"));
|
||||
return prop;
|
||||
}
|
||||
|
||||
maxSizeStr = post.get("crawler.file.maxFileSize");
|
||||
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
|
||||
|
||||
long maxFileSize;
|
||||
try {
|
||||
maxFileSize = Integer.parseInt(maxSizeStr);
|
||||
env.setConfig("crawler.file.maxFileSize", Long.toString(maxFileSize));
|
||||
} catch (final NumberFormatException e) {
|
||||
prop.put("info", "31");
|
||||
prop.putHTML("info_crawler.file.maxFileSize",post.get("crawler.file.maxFileSize"));
|
||||
return prop;
|
||||
}
|
||||
|
||||
// everything is ok
|
||||
prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout));
|
||||
prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize));
|
||||
prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize));
|
||||
prop.put("info_crawler.smb.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
|
||||
prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
|
||||
prop.put("info_crawler.file.maxFileSize", (maxFileSize==-1) ? "-1":Formatter.bytesToString(maxFileSize));
|
||||
prop.put("info", "28");
|
||||
return prop;
|
||||
}
|
||||
|
|
|
@ -26,6 +26,22 @@
|
|||
</tr>
|
||||
<tr><td colspan="3"><hr /></td></tr>
|
||||
|
||||
<tr><td colspan="3"><p><strong>SMB Crawler Settings</strong>:</p></td></tr>
|
||||
<tr valign="top">
|
||||
<td>Maximum Filesize:</td>
|
||||
<td><input name="crawler.smb.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.smb.maxFileSize]#" /></td>
|
||||
<td><em>Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. <code>-1</code> means unlimited.</em></td>
|
||||
</tr>
|
||||
<tr><td colspan="3"><hr /></td></tr>
|
||||
|
||||
<tr><td colspan="3"><p><strong>Local File Crawler Settings</strong>:</p></td></tr>
|
||||
<tr valign="top">
|
||||
<td>Maximum Filesize:</td>
|
||||
<td><input name="crawler.file.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.file.maxFileSize]#" /></td>
|
||||
<td><em>Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. <code>-1</code> means unlimited.</em></td>
|
||||
</tr>
|
||||
<tr><td colspan="3"><hr /></td></tr>
|
||||
|
||||
<tr valign="top">
|
||||
<td> </td>
|
||||
<td><input type="submit" name="crawlerSettings" value="Submit" /></td>
|
||||
|
|
|
@ -202,6 +202,7 @@ public final class Settings_p {
|
|||
prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
|
||||
prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
|
||||
prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1"));
|
||||
prop.putHTML("crawler.file.maxFileSize",sb.getConfig("crawler.file.maxFileSize", "-1"));
|
||||
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.ParserException;
|
||||
|
@ -372,7 +373,7 @@ public class ViewFile {
|
|||
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
|
||||
dark = (i % 2 == 0);
|
||||
|
||||
final HashMap<String, ImageEntry> ts = document.getImages();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> ts = document.getImages();
|
||||
final Iterator<ImageEntry> tsi = ts.values().iterator();
|
||||
ImageEntry entry;
|
||||
while (tsi.hasNext()) {
|
||||
|
@ -439,9 +440,9 @@ public class ViewFile {
|
|||
return message;
|
||||
}
|
||||
|
||||
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<DigestURI, String> media, final String name, boolean dark) {
|
||||
final Iterator<Map.Entry<DigestURI, String>> mi = media.entrySet().iterator();
|
||||
Map.Entry<DigestURI, String> entry;
|
||||
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
|
||||
final Iterator<Map.Entry<MultiProtocolURI, String>> mi = media.entrySet().iterator();
|
||||
Map.Entry<MultiProtocolURI, String> entry;
|
||||
int i = 0;
|
||||
while (mi.hasNext()) {
|
||||
entry = mi.next();
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.search.Switchboard;
|
||||
|
|
|
@ -30,8 +30,8 @@ import java.text.ParseException;
|
|||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.Hit;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
|
||||
|
@ -57,7 +57,7 @@ public class rct_p {
|
|||
final yacySeed seed = (peerhash == null) ? null : sb.peers.getConnected(peerhash);
|
||||
final RSSFeed feed = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000);
|
||||
if (feed != null) {
|
||||
for (final RSSMessage item: feed) {
|
||||
for (final Hit item: feed) {
|
||||
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
|
||||
|
||||
// put url on remote crawl stack
|
||||
|
|
|
@ -35,8 +35,8 @@ import java.util.Map;
|
|||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
import net.yacy.kelondro.data.word.WordReferenceRow;
|
||||
|
|
|
@ -30,8 +30,8 @@
|
|||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.WordReferenceRow;
|
||||
import net.yacy.kelondro.index.HandleSet;
|
||||
|
|
|
@ -29,8 +29,8 @@
|
|||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
|
|
|
@ -32,11 +32,11 @@ import java.util.HashMap;
|
|||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.geolocalization.Location;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
|
|
@ -22,7 +22,8 @@ import java.util.Set;
|
|||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.services.Search;
|
||||
import net.yacy.document.geolocalization.Location;
|
||||
import de.anomic.data.LibraryProvider;
|
||||
import de.anomic.http.server.HeaderFramework;
|
||||
|
@ -32,7 +33,6 @@ import de.anomic.search.SwitchboardConstants;
|
|||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyClient;
|
||||
|
||||
import java.util.Date;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
|
@ -91,7 +91,8 @@ public class yacysearch_location {
|
|||
|
||||
if (search_title || search_publisher || search_creator || search_subject) try {
|
||||
// get a queue of search results
|
||||
BlockingQueue<RSSMessage> results = yacyClient.search(null, query, false, false, maximumTime, Integer.MAX_VALUE);
|
||||
String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss";
|
||||
BlockingQueue<RSSMessage> results = Search.search(rssSearchServiceURL, query, false, false, maximumTime, Integer.MAX_VALUE);
|
||||
|
||||
// take the results and compute some locations
|
||||
RSSMessage message;
|
||||
|
|
|
@ -36,8 +36,8 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.Hit;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.order.Base64Order;
|
||||
|
@ -421,7 +421,7 @@ public class CrawlQueues {
|
|||
// parse the rss
|
||||
DigestURI url, referrer;
|
||||
Date loaddate;
|
||||
for (final RSSMessage item: feed) {
|
||||
for (final Hit item: feed) {
|
||||
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
|
||||
|
||||
// put url on remote crawl stack
|
||||
|
|
|
@ -354,6 +354,7 @@ public final class CrawlStacker {
|
|||
// returns true if the url can be accepted according to network.unit.domain
|
||||
if (url == null) return "url is null";
|
||||
final String host = url.getHost();
|
||||
if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null;
|
||||
if (host == null) return "url.host is null";
|
||||
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
|
||||
// check if this is a local address and we are allowed to index local pages:
|
||||
|
|
|
@ -30,9 +30,9 @@ import java.util.HashMap;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
|
||||
public class ResultImages {
|
||||
|
@ -48,18 +48,17 @@ public class ResultImages {
|
|||
// we also check all links for a double-check so we don't get the same image more than once in any queue
|
||||
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
|
||||
// the same images may be linked from different pages
|
||||
private static final ConcurrentHashMap<String, Long> doubleCheck = new ConcurrentHashMap<String, Long>(); // (url-hash, time) when the url appeared first
|
||||
private static final ConcurrentHashMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first
|
||||
|
||||
public static void registerImages(final Document document, final boolean privateEntry) {
|
||||
if (document == null) return;
|
||||
if (document.dc_source() == null) return;
|
||||
|
||||
final HashMap<String, ImageEntry> images = document.getImages();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> images = document.getImages();
|
||||
for (final ImageEntry image: images.values()) {
|
||||
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
|
||||
String hashstring = new String(image.url().hash());
|
||||
if (doubleCheck.containsKey(hashstring)) continue;
|
||||
doubleCheck.put(hashstring, System.currentTimeMillis());
|
||||
if (doubleCheck.containsKey(image.url())) continue;
|
||||
doubleCheck.put(image.url(), System.currentTimeMillis());
|
||||
|
||||
final String name = image.url().getFile();
|
||||
boolean good = false;
|
||||
|
@ -144,8 +143,8 @@ public class ResultImages {
|
|||
|
||||
public static class OriginEntry {
|
||||
public ImageEntry imageEntry;
|
||||
public DigestURI baseURL;
|
||||
public OriginEntry(final ImageEntry imageEntry, final DigestURI baseURL) {
|
||||
public MultiProtocolURI baseURL;
|
||||
public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURI baseURL) {
|
||||
this.imageEntry = imageEntry;
|
||||
this.baseURL = baseURL;
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.util.Date;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.kelondro.blob.BEncodedHeap;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -317,7 +318,7 @@ public class RobotsTxt {
|
|||
reqHeaders.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
|
||||
|
||||
// adding referer
|
||||
reqHeaders.put(RequestHeader.REFERER, (DigestURI.newURL(robotsURL,"/")).toNormalform(true, true));
|
||||
reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true, true));
|
||||
|
||||
if (entry != null) {
|
||||
oldEtag = entry.getETag();
|
||||
|
@ -380,7 +381,7 @@ public class RobotsTxt {
|
|||
redirectionUrlString = redirectionUrlString.trim();
|
||||
|
||||
// generating the new URL object
|
||||
final DigestURI redirectionUrl = DigestURI.newURL(robotsURL, redirectionUrlString);
|
||||
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(robotsURL, redirectionUrlString));
|
||||
|
||||
// following the redirection
|
||||
if (log.isFinest()) log.logFinest("Redirection detected for robots.txt with URL '" + robotsURL + "'." +
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.io.IOException;
|
|||
import java.io.PrintStream;
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -272,8 +273,8 @@ public class FTPLoader {
|
|||
* @param entryUrl
|
||||
* @return
|
||||
*/
|
||||
private String getPath(final DigestURI entryUrl) {
|
||||
return DigestURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
|
||||
private String getPath(final MultiProtocolURI entryUrl) {
|
||||
return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
144
source/de/anomic/crawler/retrieval/FileLoader.java
Normal file
144
source/de/anomic/crawler/retrieval/FileLoader.java
Normal file
|
@ -0,0 +1,144 @@
|
|||
/**
|
||||
* FileLoader
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.5.2010 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package de.anomic.crawler.retrieval;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import de.anomic.http.server.HeaderFramework;
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.http.server.ResponseHeader;
|
||||
import de.anomic.net.ftpc;
|
||||
import de.anomic.search.Segments;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.data.MimeTable;
|
||||
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
public class FileLoader {
|
||||
|
||||
private final Switchboard sb;
|
||||
private final Log log;
|
||||
private final int maxFileSize;
|
||||
|
||||
public FileLoader(final Switchboard sb, final Log log) {
|
||||
this.sb = sb;
|
||||
this.log = log;
|
||||
maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
|
||||
}
|
||||
|
||||
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
|
||||
DigestURI url = request.url();
|
||||
if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
|
||||
|
||||
RequestHeader requestHeader = new RequestHeader();
|
||||
if (request.referrerhash() != null) {
|
||||
DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
|
||||
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
|
||||
}
|
||||
|
||||
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
|
||||
if (url.isDirectory()) {
|
||||
String[] l = url.list();
|
||||
if (l == null) {
|
||||
// this can only happen if there is no connection or the directory does not exist
|
||||
log.logInfo("directory listing not available. URL = " + request.url().toString());
|
||||
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
|
||||
throw new IOException("directory listing not available. URL = " + request.url().toString());
|
||||
}
|
||||
String u = url.toNormalform(true, true);
|
||||
List<String> list = new ArrayList<String>();
|
||||
for (String s: l) {
|
||||
list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
|
||||
}
|
||||
|
||||
StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
|
||||
|
||||
ResponseHeader responseHeader = new ResponseHeader();
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
|
||||
content.toString().getBytes());
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
// create response header
|
||||
String mime = MimeTable.ext2mime(url.getFileExtension());
|
||||
ResponseHeader responseHeader = new ResponseHeader();
|
||||
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
|
||||
|
||||
// check mime type and availability of parsers
|
||||
// and also check resource size and limitation of the size
|
||||
long size = url.length();
|
||||
String parserError = null;
|
||||
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
|
||||
(size > maxFileSize && maxFileSize >= 0)) {
|
||||
// we know that we cannot process that file before loading
|
||||
// only the metadata is returned
|
||||
|
||||
if (parserError != null) {
|
||||
log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
|
||||
} else {
|
||||
log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
|
||||
}
|
||||
|
||||
// create response with metadata only
|
||||
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
|
||||
url.toNormalform(true, true).getBytes());
|
||||
return response;
|
||||
}
|
||||
|
||||
// load the resource
|
||||
InputStream is = url.getInputStream();
|
||||
byte[] b = FileUtils.read(is);
|
||||
is.close();
|
||||
|
||||
// create response with loaded content
|
||||
Response response = new Response(
|
||||
request,
|
||||
requestHeader,
|
||||
responseHeader,
|
||||
"200",
|
||||
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
|
||||
b);
|
||||
return response;
|
||||
}
|
||||
}
|
|
@ -27,6 +27,7 @@ package de.anomic.crawler.retrieval;
|
|||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -180,7 +181,7 @@ public final class HTTPLoader {
|
|||
}
|
||||
|
||||
// normalizing URL
|
||||
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
|
||||
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
|
||||
|
||||
// restart crawling with new url
|
||||
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
|
||||
|
@ -289,7 +290,7 @@ public final class HTTPLoader {
|
|||
}
|
||||
|
||||
// normalizing URL
|
||||
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
|
||||
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
|
||||
|
||||
|
||||
// if we are already doing a shutdown we don't need to retry crawling
|
||||
|
|
|
@ -52,6 +52,7 @@ import org.xml.sax.SAXException;
|
|||
|
||||
import de.anomic.data.bookmarksDB.Bookmark;
|
||||
import de.anomic.data.bookmarksDB.Tag;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.TransformerWriter;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
@ -128,9 +129,9 @@ public class BookmarkHelper {
|
|||
|
||||
int importCount = 0;
|
||||
|
||||
Map<DigestURI, String> links = new HashMap<DigestURI, String>();
|
||||
Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
|
||||
String title;
|
||||
DigestURI url;
|
||||
MultiProtocolURI url;
|
||||
Bookmark bm;
|
||||
final Set<String> tags=listManager.string2set(tag); //this allow multiple default tags
|
||||
try {
|
||||
|
@ -142,14 +143,14 @@ public class BookmarkHelper {
|
|||
writer.close();
|
||||
links = scraper.getAnchors();
|
||||
} catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
|
||||
for (Entry<DigestURI, String> link: links.entrySet()) {
|
||||
url= link.getKey();
|
||||
title=link.getValue();
|
||||
for (Entry<MultiProtocolURI, String> link: links.entrySet()) {
|
||||
url = link.getKey();
|
||||
title = link.getValue();
|
||||
Log.logInfo("BOOKMARKS", "links.get(url)");
|
||||
if(title.equals("")){//cannot be displayed
|
||||
title=url.toString();
|
||||
if (title.equals("")) {//cannot be displayed
|
||||
title = url.toString();
|
||||
}
|
||||
bm=db.new Bookmark(url.toString());
|
||||
bm = db.new Bookmark(url.toString());
|
||||
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
|
||||
bm.setTags(tags);
|
||||
bm.setPublic(importPublic);
|
||||
|
|
|
@ -5,7 +5,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.util.Properties;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
|
||||
public class MimeTable {
|
||||
|
||||
|
@ -42,11 +42,11 @@ public class MimeTable {
|
|||
return mimeTable.getProperty(ext, dfltMime);
|
||||
}
|
||||
|
||||
public static String url2mime(final DigestURI url, final String dfltMime) {
|
||||
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
|
||||
return ext2mime(url.getFileExtension(), dfltMime);
|
||||
}
|
||||
|
||||
public static String url2mime(final DigestURI url) {
|
||||
public static String url2mime(final MultiProtocolURI url) {
|
||||
return ext2mime(url.getFileExtension());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2645,7 +2645,7 @@ public class ftpc {
|
|||
page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n");
|
||||
page.append("<html><head>\n");
|
||||
page.append(" <title>" + title + "</title>\n");
|
||||
page.append(" <meta name=\"generator\" content=\"YaCy ftpc dirlisting\">\n");
|
||||
page.append(" <meta name=\"generator\" content=\"YaCy directory listing\">\n");
|
||||
if (metaRobotNoindex) {
|
||||
page.append(" <meta name=\"robots\" content=\"noindex\">\n");
|
||||
}
|
||||
|
@ -2674,7 +2674,7 @@ public class ftpc {
|
|||
if (line.length() > nameEnd) {
|
||||
page.append(line.substring(nameEnd));
|
||||
}
|
||||
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://")) {
|
||||
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://") || line.startsWith("file://")) {
|
||||
page.append("<a href=\"" + line + "\">" + line + "</a>");
|
||||
} else {
|
||||
// raw
|
||||
|
|
|
@ -146,7 +146,7 @@ public class DocumentIndex extends Segment {
|
|||
* If the given file is a path to a directory, the complete sub-tree is indexed
|
||||
* @param start
|
||||
*/
|
||||
public void addConcurrent(DigestURI start) {
|
||||
public void addConcurrent(DigestURI start) throws IOException {
|
||||
assert (start != null);
|
||||
assert (start.canRead()) : start.toString();
|
||||
if (!start.isDirectory()) {
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.util.TreeSet;
|
|||
|
||||
import de.anomic.data.MimeTable;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
@ -130,25 +131,25 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
|
|||
public static ArrayList<MediaSnippet> computeMediaSnippets(final Document document, final HandleSet queryhashes, final ContentDomain mediatype) {
|
||||
|
||||
if (document == null) return new ArrayList<MediaSnippet>();
|
||||
Map<DigestURI, String> media = null;
|
||||
Map<MultiProtocolURI, String> media = null;
|
||||
if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks();
|
||||
else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks();
|
||||
else if (mediatype == ContentDomain.APP) media = document.getApplinks();
|
||||
if (media == null) return null;
|
||||
|
||||
final Iterator<Map.Entry<DigestURI, String>> i = media.entrySet().iterator();
|
||||
Map.Entry<DigestURI, String> entry;
|
||||
final Iterator<Map.Entry<MultiProtocolURI, String>> i = media.entrySet().iterator();
|
||||
Map.Entry<MultiProtocolURI, String> entry;
|
||||
DigestURI url;
|
||||
String desc;
|
||||
final ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
url = entry.getKey();
|
||||
url = new DigestURI(entry.getKey());
|
||||
desc = entry.getValue();
|
||||
int ranking = TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
|
||||
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
|
||||
if (ranking < 2 * queryhashes.size()) {
|
||||
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, document.dc_source()));
|
||||
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, new DigestURI(document.dc_source())));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -167,7 +168,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
|
|||
final ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
|
||||
while (i.hasNext()) {
|
||||
ientry = i.next();
|
||||
url = ientry.url();
|
||||
url = new DigestURI(ientry.url());
|
||||
String u = url.toString();
|
||||
if (u.indexOf(".ico") >= 0 || u.indexOf("favicon") >= 0) continue;
|
||||
if (ientry.height() > 0 && ientry.height() < 64) continue;
|
||||
|
@ -177,7 +178,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
|
|||
TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
|
||||
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
|
||||
final int ranking = Integer.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
|
||||
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, document.dc_source()));
|
||||
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, new DigestURI(document.dc_source())));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -38,6 +38,7 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
|
@ -516,7 +517,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
|
|||
if (format == 2) {
|
||||
pw.println("<item>");
|
||||
pw.println("<title>" + CharacterCoding.unicode2xml(metadata.dc_title(), true) + "</title>");
|
||||
pw.println("<link>" + DigestURI.escape(url) + "</link>");
|
||||
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
|
||||
if (metadata.dc_creator().length() > 0) pw.println("<author>" + CharacterCoding.unicode2xml(metadata.dc_creator(), true) + "</author>");
|
||||
if (metadata.dc_subject().length() > 0) pw.println("<description>" + CharacterCoding.unicode2xml(metadata.dc_subject(), true) + "</description>");
|
||||
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
|
||||
|
|
|
@ -39,6 +39,7 @@ import java.util.concurrent.BlockingQueue;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
|
@ -631,7 +632,7 @@ public final class RankingProcess extends Thread {
|
|||
// take out relevant information for reference computation
|
||||
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
|
||||
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
|
||||
final String[] descrcomps = DigestURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
|
||||
final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
|
||||
|
||||
// add references
|
||||
//addTopic(urlcomps);
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.util.ArrayList;
|
|||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
|
@ -124,7 +125,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
|
|||
return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
|
||||
}
|
||||
public String urlname() {
|
||||
return (alternative_urlname == null) ? DigestURI.unescape(urlcomps.url().toNormalform(false, true)) : alternative_urlname;
|
||||
return (alternative_urlname == null) ? MultiProtocolURI.unescape(urlcomps.url().toNormalform(false, true)) : alternative_urlname;
|
||||
}
|
||||
public String title() {
|
||||
return urlcomps.dc_title();
|
||||
|
|
|
@ -30,8 +30,8 @@ import java.util.ArrayList;
|
|||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.HandleSet;
|
||||
|
@ -370,8 +370,8 @@ public class ResultFetcher {
|
|||
|
||||
// apply 'common-sense' heuristic using references
|
||||
final String urlstring = rentry.url().toNormalform(true, true);
|
||||
final String[] urlcomps = DigestURI.urlComps(urlstring);
|
||||
final String[] descrcomps = DigestURI.splitpattern.split(rentry.title().toLowerCase());
|
||||
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
|
||||
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
|
||||
Navigator.Item tc;
|
||||
for (int j = 0; j < urlcomps.length; j++) {
|
||||
tc = topwords.get(urlcomps[j]);
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.ParserException;
|
||||
|
@ -198,7 +199,7 @@ public class Segment {
|
|||
private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
|
||||
int wordCount = 0;
|
||||
final int urlLength = url.toNormalform(true, true).length();
|
||||
final int urlComps = DigestURI.urlComps(url.toString()).length;
|
||||
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
|
||||
|
||||
// iterate over all words of context text
|
||||
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
|
||||
|
@ -273,10 +274,10 @@ public class Segment {
|
|||
if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) {
|
||||
// no confirmation using the url, use the TLD
|
||||
language = url.language();
|
||||
System.out.println(error + ", corrected using the TLD");
|
||||
log.logWarning(error + ", corrected using the TLD");
|
||||
} else {
|
||||
// this is a strong hint that the statistics was in fact correct
|
||||
System.out.println(error + ", but the url proves that the statistic is correct");
|
||||
log.logWarning(error + ", but the url proves that the statistic is correct");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -70,16 +70,17 @@ import java.util.zip.GZIPOutputStream;
|
|||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.content.DCEntry;
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.content.SurrogateReader;
|
||||
import net.yacy.document.importer.OAIListFriendsLoader;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow.Components;
|
||||
|
@ -291,7 +292,7 @@ public final class Switchboard extends serverSwitch {
|
|||
// init sessionid name file
|
||||
final String sessionidNamesFile = getConfig("sessionidNamesFile","");
|
||||
this.log.logConfig("Loading sessionid file " + sessionidNamesFile);
|
||||
DigestURI.initSessionIDNames(new File(getRootPath(), sessionidNamesFile));
|
||||
MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File(getRootPath(), sessionidNamesFile)));
|
||||
|
||||
// init tables
|
||||
this.tables = new WorkTables(this.workPath);
|
||||
|
@ -1733,7 +1734,7 @@ public final class Switchboard extends serverSwitch {
|
|||
((response.profile() == null) || (response.depth() < response.profile().depth()))
|
||||
) {
|
||||
// get the hyperlinks
|
||||
final Map<DigestURI, String> hl = document.getHyperlinks();
|
||||
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
|
||||
|
||||
// add all images also to the crawl stack
|
||||
for (ImageEntry imageReference : document.getImages().values()) {
|
||||
|
@ -1741,15 +1742,15 @@ public final class Switchboard extends serverSwitch {
|
|||
}
|
||||
|
||||
// insert those hyperlinks to the crawler
|
||||
DigestURI nextUrl;
|
||||
for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
|
||||
MultiProtocolURI nextUrl;
|
||||
for (Map.Entry<MultiProtocolURI, String> nextEntry : hl.entrySet()) {
|
||||
// check for interruption
|
||||
checkInterruption();
|
||||
|
||||
// process the next hyperlink
|
||||
nextUrl = nextEntry.getKey();
|
||||
String u = nextUrl.toNormalform(true, true, true);
|
||||
if (!(u.startsWith("http") || u.startsWith("ftp") || u.startsWith("smb"))) continue;
|
||||
if (!(u.startsWith("http://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
|
||||
// enqueue the hyperlink into the pre-notice-url db
|
||||
try {
|
||||
crawlStacker.enqueueEntry(new Request(
|
||||
|
|
|
@ -405,7 +405,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
|
|||
/* ===========================================================================
|
||||
* COMPUTE SNIPPET
|
||||
* =========================================================================== */
|
||||
final DigestURI resFavicon = document.getFavicon();
|
||||
final DigestURI resFavicon = (document.getFavicon() == null) ? null : new DigestURI(document.getFavicon());
|
||||
if (resFavicon != null) faviconCache.put(new String(url.hash()), resFavicon);
|
||||
// we have found a parseable non-empty file: use the lines
|
||||
|
||||
|
|
|
@ -52,8 +52,8 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.CharacterCoding;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.util.Formatter;
|
||||
|
@ -369,9 +369,9 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
|
|||
if (this.size() == 0) return "";
|
||||
StringBuilder param = new StringBuilder();
|
||||
for (Map.Entry<String, String> entry: this.entrySet()) {
|
||||
param.append(DigestURI.escape(entry.getKey()));
|
||||
param.append(MultiProtocolURI.escape(entry.getKey()));
|
||||
param.append('=');
|
||||
param.append(DigestURI.escape(entry.getValue()));
|
||||
param.append(MultiProtocolURI.escape(entry.getValue()));
|
||||
param.append('&');
|
||||
}
|
||||
param.setLength(param.length() - 1);
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.util.SortedMap;
|
|||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
@ -95,11 +96,11 @@ public class WebStructureGraph {
|
|||
}
|
||||
|
||||
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final Document document, final Condenser condenser, final Date docDate) {
|
||||
final DigestURI url = document.dc_source();
|
||||
final DigestURI url = new DigestURI(document.dc_source());
|
||||
|
||||
// generate citation reference
|
||||
final Map<DigestURI, String> hl = document.getHyperlinks();
|
||||
final Iterator<DigestURI> it = hl.keySet().iterator();
|
||||
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
|
||||
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
|
||||
byte[] nexturlhashb;
|
||||
String nexturlhash;
|
||||
final StringBuilder cpg = new StringBuilder(12 * (hl.size() + 1) + 1);
|
||||
|
@ -109,7 +110,7 @@ public class WebStructureGraph {
|
|||
int GCount = 0;
|
||||
int LCount = 0;
|
||||
while (it.hasNext()) {
|
||||
nexturlhashb = it.next().hash();
|
||||
nexturlhashb = new DigestURI(it.next()).hash();
|
||||
if (nexturlhashb != null) {
|
||||
nexturlhash = new String(nexturlhashb);
|
||||
assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash;
|
||||
|
|
|
@ -54,14 +54,12 @@ import java.util.Iterator;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.document.parser.xml.RSSReader;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.HttpConnector;
|
||||
import net.yacy.cora.services.Search;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
|
@ -86,10 +84,8 @@ import de.anomic.crawler.retrieval.HTTPLoader;
|
|||
import de.anomic.http.client.DefaultCharsetFilePart;
|
||||
import de.anomic.http.client.DefaultCharsetStringPart;
|
||||
import de.anomic.http.client.Client;
|
||||
import de.anomic.http.client.RemoteProxyConfig;
|
||||
import de.anomic.http.server.HeaderFramework;
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.http.server.ResponseContainer;
|
||||
import de.anomic.search.RankingProfile;
|
||||
import de.anomic.search.RankingProcess;
|
||||
import de.anomic.search.Segment;
|
||||
|
@ -101,6 +97,22 @@ import de.anomic.tools.crypt;
|
|||
|
||||
public final class yacyClient {
|
||||
|
||||
|
||||
/**
|
||||
* @see wput
|
||||
* @param target
|
||||
* @param filename
|
||||
* @param post
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private static byte[] postToFile(final yacySeed target, final String filename, final List<Part> post, final int timeout) throws IOException {
|
||||
return HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false);
|
||||
}
|
||||
private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List<Part> post, final int timeout) throws IOException {
|
||||
return HttpConnector.wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* this is called to enrich the seed information by
|
||||
* - own address (if peer is behind a nat/router)
|
||||
|
@ -134,7 +146,7 @@ public final class yacyClient {
|
|||
post.add(new DefaultCharsetStringPart("seed", mySeed.genSeedStr(salt)));
|
||||
// send request
|
||||
final long start = System.currentTimeMillis();
|
||||
final byte[] content = wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false);
|
||||
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false);
|
||||
yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds");
|
||||
result = FileUtils.table(content);
|
||||
break;
|
||||
|
@ -237,82 +249,6 @@ public final class yacyClient {
|
|||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* send data to the server named by vhost
|
||||
*
|
||||
* @param address address of the server
|
||||
* @param vhost name of the server at address which should respond
|
||||
* @param post data to send (name-value-pairs)
|
||||
* @param gzipBody send with content gzip encoded
|
||||
* @return response body
|
||||
* @throws IOException
|
||||
*/
|
||||
/*
|
||||
private static byte[] wput(final String url, String vhost, final List<Part> post, boolean gzipBody) throws IOException {
|
||||
return wput(url, vhost, post, 10000, gzipBody);
|
||||
}
|
||||
*/
|
||||
/**
|
||||
* send data to the server named by vhost
|
||||
*
|
||||
* @param address address of the server
|
||||
* @param vhost name of the server at address which should respond
|
||||
* @param post data to send (name-value-pairs)
|
||||
* @param timeout in milliseconds
|
||||
* @return response body
|
||||
* @throws IOException
|
||||
*/
|
||||
private static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout) throws IOException {
|
||||
return wput(url, vhost, post, timeout, false);
|
||||
}
|
||||
/**
|
||||
* send data to the server named by vhost
|
||||
*
|
||||
* @param address address of the server
|
||||
* @param vhost name of the server at address which should respond
|
||||
* @param post data to send (name-value-pairs)
|
||||
* @param timeout in milliseconds
|
||||
* @param gzipBody send with content gzip encoded
|
||||
* @return response body
|
||||
* @throws IOException
|
||||
*/
|
||||
private static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout, final boolean gzipBody) throws IOException {
|
||||
final RequestHeader header = new RequestHeader();
|
||||
header.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
|
||||
header.put(HeaderFramework.HOST, vhost);
|
||||
final Client client = new Client(timeout, header);
|
||||
client.setProxy(proxyConfig());
|
||||
|
||||
ResponseContainer res = null;
|
||||
byte[] content = null;
|
||||
try {
|
||||
// send request/data
|
||||
res = client.POST(url, post, gzipBody);
|
||||
content = res.getData();
|
||||
} finally {
|
||||
if(res != null) {
|
||||
// release connection
|
||||
res.closeStream();
|
||||
}
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see wput
|
||||
* @param target
|
||||
* @param filename
|
||||
* @param post
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private static byte[] postToFile(final yacySeed target, final String filename, final List<Part> post, final int timeout) throws IOException {
|
||||
return wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false);
|
||||
}
|
||||
private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List<Part> post, final int timeout) throws IOException {
|
||||
return wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false);
|
||||
}
|
||||
|
||||
public static yacySeed querySeed(final yacySeed target, final String seedHash) {
|
||||
// prepare request
|
||||
final String salt = crypt.randomSalt();
|
||||
|
@ -400,7 +336,7 @@ public final class yacyClient {
|
|||
// send request
|
||||
try {
|
||||
/* a long time-out is needed */
|
||||
final byte[] result = wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime);
|
||||
final byte[] result = HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime);
|
||||
final RSSReader reader = RSSReader.parse(result);
|
||||
if (reader == null) {
|
||||
yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null");
|
||||
|
@ -425,120 +361,11 @@ public final class yacyClient {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static BlockingQueue<RSSMessage> search(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords) {
|
||||
if (urlBase == null) {
|
||||
urlBase = "http://localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss";
|
||||
}
|
||||
BlockingQueue<RSSMessage> queue = new LinkedBlockingQueue<RSSMessage>();
|
||||
searchJob job = new searchJob(urlBase, query, verify, global, timeout, maximumRecords, queue);
|
||||
job.start();
|
||||
return queue;
|
||||
}
|
||||
|
||||
// page size of a searchJob: passed as maximumRecords to each search call and used as the startRecord increment
private final static int recordsPerSession = 10;
|
||||
|
||||
public static class searchJob extends Thread {
|
||||
|
||||
String urlBase, query;
|
||||
boolean verify, global;
|
||||
long timeout;
|
||||
int startRecord, maximumRecords;
|
||||
BlockingQueue<RSSMessage> queue;
|
||||
|
||||
public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue<RSSMessage> queue) {
|
||||
this.urlBase = urlBase;
|
||||
this.query = query;
|
||||
this.verify = verify;
|
||||
this.global = global;
|
||||
this.timeout = timeout;
|
||||
this.startRecord = 0;
|
||||
this.maximumRecords = maximumRecords;
|
||||
this.queue = queue;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
RSSMessage message;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession);
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param urlBase the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed search(String urlBase, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) {
|
||||
// returns a search result from a peer
|
||||
if (urlBase == null) {
|
||||
urlBase = "http://localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss";
|
||||
}
|
||||
DigestURI uri = null;
|
||||
try {
|
||||
uri = new DigestURI(urlBase, null);
|
||||
} catch (MalformedURLException e) {
|
||||
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + urlBase + "': bad url, " + e.getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
// prepare request
|
||||
final List<Part> post = new ArrayList<Part>();
|
||||
post.add(new DefaultCharsetStringPart("query", query));
|
||||
post.add(new DefaultCharsetStringPart("startRecord", Integer.toString(startRecord)));
|
||||
post.add(new DefaultCharsetStringPart("maximumRecords", Long.toString(maximumRecords)));
|
||||
post.add(new DefaultCharsetStringPart("verify", verify ? "true" : "false"));
|
||||
post.add(new DefaultCharsetStringPart("resource", global ? "global" : "local"));
|
||||
|
||||
// send request
|
||||
try {
|
||||
final byte[] result = wput(urlBase, uri.getHost(), post, (int) timeout);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(result);
|
||||
if (reader == null) {
|
||||
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
return null;
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
return null;
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
yacyCore.log.logSevere("yacyClient.search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) {
|
||||
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
|
||||
String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress();
|
||||
String urlBase = "http://" + address + "/yacysearch.rss";
|
||||
return search(urlBase, query, verify, global, timeout, startRecord, maximumRecords);
|
||||
return Search.search(urlBase, query, verify, global, timeout, startRecord, maximumRecords);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
|
@ -607,7 +434,7 @@ public final class yacyClient {
|
|||
// send request
|
||||
HashMap<String, String> result = null;
|
||||
try {
|
||||
result = FileUtils.table(wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000));
|
||||
result = FileUtils.table(HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000));
|
||||
} catch (final IOException e) {
|
||||
yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
|
||||
//yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
|
||||
|
@ -878,7 +705,7 @@ public final class yacyClient {
|
|||
|
||||
// send request
|
||||
try {
|
||||
final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000);
|
||||
final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000);
|
||||
final HashMap<String, String> result = FileUtils.table(content);
|
||||
return result;
|
||||
} catch (final Exception e) {
|
||||
|
@ -902,7 +729,7 @@ public final class yacyClient {
|
|||
|
||||
// send request
|
||||
try {
|
||||
final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000);
|
||||
final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000);
|
||||
final HashMap<String, String> result = FileUtils.table(content);
|
||||
return result;
|
||||
} catch (final Exception e) {
|
||||
|
@ -977,7 +804,7 @@ public final class yacyClient {
|
|||
|
||||
// send request
|
||||
try {
|
||||
final byte[] content = wput("http://" + address + "/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000);
|
||||
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000);
|
||||
return FileUtils.table(content);
|
||||
} catch (final Exception e) {
|
||||
// most probably a network time-out exception
|
||||
|
@ -1127,7 +954,7 @@ public final class yacyClient {
|
|||
post.add(new DefaultCharsetStringPart("entryc", Integer.toString(indexcount)));
|
||||
post.add(new DefaultCharsetStringPart("indexes", entrypost.toString()));
|
||||
try {
|
||||
final byte[] content = wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
|
||||
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
|
||||
final Iterator<String> v = FileUtils.strings(content);
|
||||
// this should return a list of urlhashes that are unknown
|
||||
|
||||
|
@ -1171,7 +998,7 @@ public final class yacyClient {
|
|||
}
|
||||
post.add(new DefaultCharsetStringPart("urlc", Integer.toString(urlc)));
|
||||
try {
|
||||
final byte[] content = wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
|
||||
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
|
||||
final Iterator<String> v = FileUtils.strings(content);
|
||||
|
||||
final HashMap<String, String> result = FileUtils.table(v);
|
||||
|
@ -1193,7 +1020,7 @@ public final class yacyClient {
|
|||
String address = targetSeed.getClusterAddress();
|
||||
if (address == null) { address = "localhost:8080"; }
|
||||
try {
|
||||
final byte[] content = wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000);
|
||||
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000);
|
||||
return FileUtils.table(content);
|
||||
} catch (final Exception e) {
|
||||
yacyCore.log.logSevere("yacyClient.getProfile error:" + e.getMessage());
|
||||
|
@ -1201,14 +1028,6 @@ public final class yacyClient {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Selects the remote proxy configuration to use for "to YaCy connections".
|
||||
* @return the RemoteProxyConfig if one exists, has proxy usage enabled and permits the proxy for YaCy connections; otherwise null
|
||||
*/
|
||||
private static final RemoteProxyConfig proxyConfig() {
|
||||
final RemoteProxyConfig p = RemoteProxyConfig.getRemoteProxyConfig();
|
||||
return ((p != null) && (p.useProxy()) && (p.useProxy4Yacy())) ? p : null;
|
||||
}
|
||||
|
||||
public static void main(final String[] args) {
|
||||
if(args.length > 1) {
|
||||
|
@ -1262,7 +1081,7 @@ public final class yacyClient {
|
|||
//post.add(new FilePart("filename", new ByteArrayPartSource(filename, file)));
|
||||
// do it!
|
||||
try {
|
||||
final byte[] response = wput(url.toString(), vhost, post, timeout, gzipBody);
|
||||
final byte[] response = HttpConnector.wput(url.toString(), vhost, post, timeout, gzipBody);
|
||||
System.out.println(new String(response));
|
||||
} catch (final IOException e) {
|
||||
Log.logException(e);
|
||||
|
|
|
@ -48,8 +48,8 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
|
|
|
@ -26,8 +26,8 @@ package de.anomic.yacy;
|
|||
|
||||
import java.util.HashMap;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.util.MapTools;
|
||||
|
|
|
@ -45,8 +45,8 @@ import java.util.Map;
|
|||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.order.Base64Order;
|
||||
|
@ -74,17 +74,17 @@ public final class yacyRelease extends yacyVersion {
|
|||
private static Map<yacyUpdateLocation, DevAndMainVersions> latestReleases = new HashMap<yacyUpdateLocation, DevAndMainVersions>();
|
||||
public final static List<yacyUpdateLocation> latestReleaseLocations = new ArrayList<yacyUpdateLocation>(); // will be initialized with value in defaults/yacy.network.freeworld.unit
|
||||
|
||||
private DigestURI url;
|
||||
private MultiProtocolURI url;
|
||||
private File releaseFile;
|
||||
|
||||
private PublicKey publicKey;
|
||||
|
||||
public yacyRelease(final DigestURI url) {
|
||||
public yacyRelease(final MultiProtocolURI url) {
|
||||
super(url.getFileName());
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
public yacyRelease(final DigestURI url, PublicKey publicKey) {
|
||||
public yacyRelease(final MultiProtocolURI url, PublicKey publicKey) {
|
||||
this(url);
|
||||
this.publicKey = publicKey;
|
||||
}
|
||||
|
@ -94,7 +94,7 @@ public final class yacyRelease extends yacyVersion {
|
|||
this.releaseFile = releaseFile;
|
||||
}
|
||||
|
||||
public DigestURI getUrl() {
|
||||
public MultiProtocolURI getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
|
@ -241,10 +241,10 @@ public final class yacyRelease extends yacyVersion {
|
|||
}
|
||||
|
||||
// analyse links in scraper resource, and find link to latest release in it
|
||||
final Map<DigestURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
|
||||
final Map<MultiProtocolURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
|
||||
final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
|
||||
final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
|
||||
for(DigestURI url : anchors.keySet()) {
|
||||
for (MultiProtocolURI url : anchors.keySet()) {
|
||||
try {
|
||||
yacyRelease release = new yacyRelease(url, location.getPublicKey());
|
||||
//System.out.println("r " + release.toAnchor());
|
||||
|
|
42
source/net/yacy/cora/document/Channel.java
Normal file
42
source/net/yacy/cora/document/Channel.java
Normal file
|
@ -0,0 +1,42 @@
|
|||
/**
|
||||
* Channel
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 10.5.2010 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.document;
|
||||
|
||||
public interface Channel extends Iterable<Hit> {
|
||||
|
||||
public void setTitle(String title);
|
||||
|
||||
public void setLink(String link);
|
||||
|
||||
public void setDescription(String description);
|
||||
|
||||
public void setImageURL(String imageUrl);
|
||||
|
||||
public void setTotalResults(String totalResults);
|
||||
|
||||
public void setStartIndex(String startIndex);
|
||||
|
||||
public void setItemsPerPage(String itemsPerPage);
|
||||
|
||||
public void setSearchTerms(String searchTerms);
|
||||
}
|
27
source/net/yacy/cora/document/Channels.java
Normal file
27
source/net/yacy/cora/document/Channels.java
Normal file
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
* Channels
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 10.5.2010 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.document;
|
||||
|
||||
/**
 * Placeholder class: it currently declares no fields and no methods.
 */
public class Channels {
|
||||
|
||||
}
|
74
source/net/yacy/cora/document/Hit.java
Normal file
74
source/net/yacy/cora/document/Hit.java
Normal file
|
@ -0,0 +1,74 @@
|
|||
/**
|
||||
* Hit
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 10.5.2010 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.cora.document;
|
||||
|
||||
/**
 * A single result entry described by string attributes.
 * Each attribute has a setter; all attributes except "creator" also have a
 * getter (no getCreator() is declared in this interface).
 */
public interface Hit {

    // ---- setters ----

    /** @param author the author of this entry */
    void setAuthor(String author);

    /** @param copyright the copyright statement of this entry */
    void setCopyright(String copyright);

    /** @param category the category of this entry */
    void setCategory(String category);

    /** @param title the title of this entry */
    void setTitle(String title);

    /** @param link the link of this entry */
    void setLink(String link);

    /** @param referrer the referrer of this entry */
    void setReferrer(String referrer);

    /** @param language the language of this entry */
    void setLanguage(String language);

    /** @param description the description of this entry */
    void setDescription(String description);

    /** @param creator the creator of this entry */
    void setCreator(String creator);

    /** @param pubdate the publication date of this entry, passed as a string */
    void setPubDate(String pubdate);

    /** @param guid the guid of this entry */
    void setGuid(String guid);

    /** @param docs the docs value of this entry */
    void setDocs(String docs);

    // ---- getters ----

    /** @return the author of this entry */
    String getAuthor();

    /** @return the copyright statement of this entry */
    String getCopyright();

    /** @return the category of this entry */
    String getCategory();

    /** @return the title of this entry */
    String getTitle();

    /** @return the link of this entry */
    String getLink();

    /** @return the referrer of this entry */
    String getReferrer();

    /** @return the language of this entry */
    String getLanguage();

    /** @return the description of this entry */
    String getDescription();

    /** @return the publication date of this entry as a string */
    String getPubDate();

    /** @return the guid of this entry */
    String getGuid();

    /** @return the docs value of this entry */
    String getDocs();
}
|
1037
source/net/yacy/cora/document/MultiProtocolURI.java
Normal file
1037
source/net/yacy/cora/document/MultiProtocolURI.java
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -21,19 +21,19 @@
|
|||
* USA
|
||||
*/
|
||||
|
||||
package net.yacy.kelondro.util;
|
||||
package net.yacy.cora.document;
|
||||
|
||||
|
||||
public class Punycode {
|
||||
/* Punycode parameters */
|
||||
final static int TMIN = 1;
|
||||
final static int TMAX = 26;
|
||||
final static int BASE = 36;
|
||||
final static int INITIAL_N = 128;
|
||||
final static int INITIAL_BIAS = 72;
|
||||
final static int DAMP = 700;
|
||||
final static int SKEW = 38;
|
||||
final static char DELIMITER = '-';
|
||||
private final static int TMIN = 1;
|
||||
private final static int TMAX = 26;
|
||||
private final static int BASE = 36;
|
||||
private final static int INITIAL_N = 128;
|
||||
private final static int INITIAL_BIAS = 72;
|
||||
private final static int DAMP = 700;
|
||||
private final static int SKEW = 38;
|
||||
private final static char DELIMITER = '-';
|
||||
|
||||
/**
|
||||
* Punycodes a unicode string.
|
|
@ -1,40 +1,31 @@
|
|||
// RSSFeed.java
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 24.04.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* RSSFeed
|
||||
* Copyright 2007 by Michael Peter Christen
|
||||
* First released 16.7.2007 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.parser.xml;
|
||||
package net.yacy.cora.document;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
|
||||
|
||||
public class RSSFeed implements Iterable<RSSMessage> {
|
||||
public class RSSFeed implements Iterable<Hit> {
|
||||
|
||||
// static channel names of feeds
|
||||
public static final String TEST = "TEST";
|
||||
|
@ -119,7 +110,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
return messages.size();
|
||||
}
|
||||
|
||||
public Iterator<RSSMessage> iterator() {
|
||||
public Iterator<Hit> iterator() {
|
||||
return new messageIterator();
|
||||
}
|
||||
|
||||
|
@ -131,7 +122,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
|
|||
return messages.remove(nextGUID);
|
||||
}
|
||||
|
||||
public class messageIterator implements Iterator<RSSMessage>{
|
||||
public class messageIterator implements Iterator<Hit>{
|
||||
|
||||
Iterator<String> GUIDiterator;
|
||||
String lastGUID;
|
|
@ -1,31 +1,24 @@
|
|||
// RSSMessage.java
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 16.07.2007 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* RSSMessage
|
||||
* Copyright 2007 by Michael Peter Christen
|
||||
* First released 16.7.2007 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.document.content;
|
||||
package net.yacy.cora.document;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
|
@ -33,7 +26,7 @@ import java.util.HashSet;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class RSSMessage {
|
||||
public class RSSMessage implements Hit {
|
||||
|
||||
// statics for item generation and automatic categorization
|
||||
private static int guidcount = 0;
|
||||
|
@ -165,4 +158,74 @@ public class RSSMessage {
|
|||
/**
 * @return the string form of the map that backs this message
 */
public String toString() {
|
||||
return this.map.toString();
|
||||
}
|
||||
|
||||
public void setAuthor(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setCategory(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setCopyright(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setCreator(String pubdate) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setDocs(String guid) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setGuid(String guid) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setLanguage(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setLink(String link) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setPubDate(String pubdate) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setReferrer(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setSize(long size) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setSizename(String sizename) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
// TODO: not implemented - the argument is currently ignored and nothing is stored
|
||||
|
||||
}
|
||||
}
|
|
@ -1,30 +1,24 @@
|
|||
// RSSReader.java
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 16.07.2007 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
/**
|
||||
* RSSReader
|
||||
* Copyright 2007 by Michael Peter Christen
|
||||
* First released 16.7.2007 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.document.parser.xml;
|
||||
package net.yacy.cora.document;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -34,10 +28,6 @@ import javax.xml.parsers.ParserConfigurationException;
|
|||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
@ -86,25 +76,21 @@ public class RSSReader extends DefaultHandler {
|
|||
}
|
||||
}
|
||||
|
||||
public static RSSReader parse(final byte[] a) {
|
||||
public static RSSReader parse(final byte[] a) throws IOException {
|
||||
|
||||
// check integrity of array
|
||||
if ((a == null) || (a.length == 0)) {
|
||||
Log.logWarning("rssReader", "response=null");
|
||||
return null;
|
||||
throw new IOException("response=null");
|
||||
}
|
||||
if (a.length < 100) {
|
||||
Log.logWarning("rssReader", "response=" + new String(a));
|
||||
return null;
|
||||
throw new IOException("response=" + new String(a));
|
||||
}
|
||||
if (!ByteBuffer.equals(a, "<?xml".getBytes())) {
|
||||
Log.logWarning("rssReader", "response does not contain valid xml");
|
||||
return null;
|
||||
if (!equals(a, "<?xml".getBytes())) {
|
||||
throw new IOException("response does not contain valid xml");
|
||||
}
|
||||
final String end = new String(a, a.length - 10, 10);
|
||||
if (end.indexOf("rss") < 0) {
|
||||
Log.logWarning("rssReader", "response incomplete");
|
||||
return null;
|
||||
throw new IOException("response incomplete");
|
||||
}
|
||||
|
||||
// make input stream
|
||||
|
@ -115,13 +101,18 @@ public class RSSReader extends DefaultHandler {
|
|||
try {
|
||||
reader = new RSSReader(bais);
|
||||
} catch (final Exception e) {
|
||||
Log.logException(e);
|
||||
Log.logWarning("rssReader", "parse exception: " + e.getMessage(), e);
|
||||
return null;
|
||||
throw new IOException("parse exception: " + e.getMessage(), e);
|
||||
}
|
||||
try { bais.close(); } catch (final IOException e) {}
|
||||
return reader;
|
||||
}
|
||||
|
||||
private final static boolean equals(final byte[] buffer, final byte[] pattern) {
|
||||
// compares two byte arrays: true, if pattern appears completely at offset position
|
||||
if (buffer.length < pattern.length) return false;
|
||||
for (int i = 0; i < pattern.length; i++) if (buffer[i] != pattern[i]) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
90
source/net/yacy/cora/protocol/HttpConnector.java
Normal file
90
source/net/yacy/cora/protocol/HttpConnector.java
Normal file
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* HttpConnector
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.cora.protocol;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.httpclient.methods.multipart.Part;
|
||||
|
||||
import de.anomic.crawler.retrieval.HTTPLoader;
|
||||
import de.anomic.http.client.Client;
|
||||
import de.anomic.http.client.RemoteProxyConfig;
|
||||
import de.anomic.http.server.HeaderFramework;
|
||||
import de.anomic.http.server.RequestHeader;
|
||||
import de.anomic.http.server.ResponseContainer;
|
||||
|
||||
public class HttpConnector {
|
||||
|
||||
/**
|
||||
* send data to the server named by vhost
|
||||
*
|
||||
* @param address address of the server
|
||||
* @param vhost name of the server at address which should respond
|
||||
* @param post data to send (name-value-pairs)
|
||||
* @param timeout in milliseconds
|
||||
* @return response body
|
||||
* @throws IOException
|
||||
*/
|
||||
public static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout) throws IOException {
|
||||
return wput(url, vhost, post, timeout, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* send data to the server named by vhost
|
||||
*
|
||||
* @param address address of the server
|
||||
* @param vhost name of the server at address which should respond
|
||||
* @param post data to send (name-value-pairs)
|
||||
* @param timeout in milliseconds
|
||||
* @param gzipBody send with content gzip encoded
|
||||
* @return response body
|
||||
* @throws IOException
|
||||
*/
|
||||
public static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout, final boolean gzipBody) throws IOException {
|
||||
final RequestHeader header = new RequestHeader();
|
||||
header.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
|
||||
header.put(HeaderFramework.HOST, vhost);
|
||||
final Client client = new Client(timeout, header);
|
||||
client.setProxy(proxyConfig());
|
||||
|
||||
ResponseContainer res = null;
|
||||
byte[] content = null;
|
||||
try {
|
||||
// send request/data
|
||||
res = client.POST(url, post, gzipBody);
|
||||
content = res.getData();
|
||||
} finally {
|
||||
if(res != null) {
|
||||
// release connection
|
||||
res.closeStream();
|
||||
}
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
|
||||
private static final RemoteProxyConfig proxyConfig() {
|
||||
final RemoteProxyConfig p = RemoteProxyConfig.getRemoteProxyConfig();
|
||||
return ((p != null) && (p.useProxy()) && (p.useProxy4Yacy())) ? p : null;
|
||||
}
|
||||
}
|
145
source/net/yacy/cora/services/Search.java
Normal file
145
source/net/yacy/cora/services/Search.java
Normal file
|
@ -0,0 +1,145 @@
|
|||
/**
|
||||
* Search
|
||||
* Copyright 2010 by Michael Peter Christen
|
||||
* First released 25.05.2010 at http://yacy.net
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file COPYING.LESSER.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.cora.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSMessage;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.protocol.HttpConnector;
|
||||
|
||||
import org.apache.commons.httpclient.methods.multipart.Part;
|
||||
import org.apache.commons.httpclient.methods.multipart.StringPart;
|
||||
|
||||
public class Search {
|
||||
|
||||
public static BlockingQueue<RSSMessage> search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int maximumRecords) {
|
||||
BlockingQueue<RSSMessage> queue = new LinkedBlockingQueue<RSSMessage>();
|
||||
searchJob job = new searchJob(rssSearchServiceURL, query, verify, global, timeout, maximumRecords, queue);
|
||||
job.start();
|
||||
return queue;
|
||||
}
|
||||
|
||||
private final static int recordsPerSession = 10;
|
||||
|
||||
public static class searchJob extends Thread {
|
||||
|
||||
String urlBase, query;
|
||||
boolean verify, global;
|
||||
long timeout;
|
||||
int startRecord, maximumRecords;
|
||||
BlockingQueue<RSSMessage> queue;
|
||||
|
||||
public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue<RSSMessage> queue) {
|
||||
this.urlBase = urlBase;
|
||||
this.query = query;
|
||||
this.verify = verify;
|
||||
this.global = global;
|
||||
this.timeout = timeout;
|
||||
this.startRecord = 0;
|
||||
this.maximumRecords = maximumRecords;
|
||||
this.queue = queue;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
RSSMessage message;
|
||||
mainloop: while (timeout > 0 && maximumRecords > 0) {
|
||||
long st = System.currentTimeMillis();
|
||||
RSSFeed feed;
|
||||
try {
|
||||
feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession);
|
||||
} catch (IOException e1) {
|
||||
break mainloop;
|
||||
}
|
||||
if (feed == null || feed.isEmpty()) break mainloop;
|
||||
maximumRecords -= feed.size();
|
||||
innerloop: while (!feed.isEmpty()) {
|
||||
message = feed.pollMessage();
|
||||
if (message == null) break innerloop;
|
||||
try {
|
||||
queue.put(message);
|
||||
} catch (InterruptedException e) {
|
||||
break innerloop;
|
||||
}
|
||||
}
|
||||
startRecord += recordsPerSession;
|
||||
timeout -= System.currentTimeMillis() - st;
|
||||
}
|
||||
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* send a query to a yacy public search interface
|
||||
* @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
|
||||
* @param query the query as string
|
||||
* @param startRecord number of first record
|
||||
* @param maximumRecords maximum number of records
|
||||
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
|
||||
* @param global if true also search results from other peers are included
|
||||
* @param timeout milliseconds that are waited at maximum for a search result
|
||||
* @return
|
||||
*/
|
||||
public static RSSFeed search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
|
||||
MultiProtocolURI uri = null;
|
||||
try {
|
||||
uri = new MultiProtocolURI(rssSearchServiceURL);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage());
|
||||
}
|
||||
|
||||
// prepare request
|
||||
final List<Part> post = new ArrayList<Part>();
|
||||
post.add(new StringPart("query", query, Charset.defaultCharset().name()));
|
||||
post.add(new StringPart("startRecord", Integer.toString(startRecord), Charset.defaultCharset().name()));
|
||||
post.add(new StringPart("maximumRecords", Long.toString(maximumRecords), Charset.defaultCharset().name()));
|
||||
post.add(new StringPart("verify", verify ? "true" : "false", Charset.defaultCharset().name()));
|
||||
post.add(new StringPart("resource", global ? "global" : "local", Charset.defaultCharset().name()));
|
||||
|
||||
// send request
|
||||
try {
|
||||
final byte[] result = HttpConnector.wput(rssSearchServiceURL, uri.getHost(), post, (int) timeout);
|
||||
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
|
||||
final RSSReader reader = RSSReader.parse(result);
|
||||
if (reader == null) {
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
|
||||
}
|
||||
final RSSFeed feed = reader.getFeed();
|
||||
if (feed == null) {
|
||||
// case where the rss reader does not understand the content
|
||||
throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
|
||||
}
|
||||
return feed;
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -33,7 +33,7 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.workflow.WorkflowThread;
|
||||
|
||||
|
@ -108,7 +108,7 @@ public abstract class AbstractParser implements Idiom {
|
|||
return tempFile;
|
||||
}
|
||||
|
||||
public int parseDir(final DigestURI location, final String prefix, final File dir, final Document doc)
|
||||
public int parseDir(final MultiProtocolURI location, final String prefix, final File dir, final Document doc)
|
||||
throws ParserException, InterruptedException, IOException {
|
||||
if (!dir.isDirectory())
|
||||
throw new ParserException("tried to parse ordinary file " + dir + " as directory", location);
|
||||
|
@ -122,7 +122,7 @@ public abstract class AbstractParser implements Idiom {
|
|||
if (file.isDirectory()) {
|
||||
result += parseDir(location, prefix, file, doc);
|
||||
} else try {
|
||||
final DigestURI url = DigestURI.newURL(location, "/" + prefix + "/"
|
||||
final MultiProtocolURI url = MultiProtocolURI.newURL(location, "/" + prefix + "/"
|
||||
// XXX: workaround for relative paths within document
|
||||
+ file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
|
||||
+ "/" + file.getName());
|
||||
|
@ -151,7 +151,7 @@ public abstract class AbstractParser implements Idiom {
|
|||
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[])
|
||||
*/
|
||||
public Document parse(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final String mimeType,
|
||||
final String charset,
|
||||
final byte[] source
|
||||
|
@ -186,7 +186,7 @@ public abstract class AbstractParser implements Idiom {
|
|||
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File)
|
||||
*/
|
||||
public Document parse(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final String mimeType,
|
||||
final String charset,
|
||||
final File sourceFile
|
||||
|
@ -220,7 +220,7 @@ public abstract class AbstractParser implements Idiom {
|
|||
*
|
||||
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
|
||||
*/
|
||||
public abstract Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
|
||||
public abstract Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
|
||||
|
||||
/**
|
||||
* Return the name of the parser
|
||||
|
|
|
@ -46,10 +46,10 @@ import java.util.Properties;
|
|||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.language.Identificator;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.data.word.WordReferenceRow;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
@ -125,7 +125,7 @@ public final class Condenser {
|
|||
this.languageIdentificator = new Identificator();
|
||||
|
||||
|
||||
Map.Entry<DigestURI, String> entry;
|
||||
Map.Entry<MultiProtocolURI, String> entry;
|
||||
if (indexText) {
|
||||
createCondensement(document.getText());
|
||||
// the phrase counter:
|
||||
|
@ -179,7 +179,7 @@ public final class Condenser {
|
|||
if (indexMedia) {
|
||||
// add anchor descriptions: here, we also add the url components
|
||||
// audio
|
||||
Iterator<Map.Entry<DigestURI, String>> i = document.getAudiolinks().entrySet().iterator();
|
||||
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
|
||||
|
|
|
@ -45,9 +45,9 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -55,7 +55,7 @@ import net.yacy.kelondro.util.FileUtils;
|
|||
|
||||
public class Document {
|
||||
|
||||
private final DigestURI source; // the source url
|
||||
private final MultiProtocolURI source; // the source url
|
||||
private final String mimeType; // mimeType as taken from http header
|
||||
private final String charset; // the charset of the document
|
||||
private final List<String> keywords; // most resources provide a keyword field
|
||||
|
@ -65,24 +65,24 @@ public class Document {
|
|||
private final List<String> sections; // if present: more titles/headlines appearing in the document
|
||||
private final StringBuilder description; // an abstract, if present: short content description
|
||||
private Object text; // the clear text, all that is visible
|
||||
private final Map<DigestURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
|
||||
private final HashMap<String, ImageEntry> images; // all visible pictures in document
|
||||
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
|
||||
private final HashMap<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
|
||||
// the anchors and images - Maps are URL-to-EntityDescription mappings.
|
||||
// The EntityDescription appear either as visible text in anchors or as alternative
|
||||
// text in image tags.
|
||||
private Map<DigestURI, String> hyperlinks, audiolinks, videolinks, applinks;
|
||||
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks;
|
||||
private Map<String, String> emaillinks;
|
||||
private DigestURI favicon;
|
||||
private MultiProtocolURI favicon;
|
||||
private boolean resorted;
|
||||
private InputStream textStream;
|
||||
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
|
||||
private Set<String> languages;
|
||||
private boolean indexingDenied;
|
||||
|
||||
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
|
||||
public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set<String> languages,
|
||||
final String[] keywords, final String title, final String author, final String publisher,
|
||||
final String[] sections, final String abstrct,
|
||||
final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images,
|
||||
final Object text, final Map<MultiProtocolURI, String> anchors, final HashMap<MultiProtocolURI, ImageEntry> images,
|
||||
boolean indexingDenied) {
|
||||
this.source = location;
|
||||
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
|
||||
|
@ -92,8 +92,8 @@ public class Document {
|
|||
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
|
||||
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
|
||||
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
|
||||
this.anchors = (anchors == null) ? new HashMap<DigestURI, String>(0) : anchors;
|
||||
this.images = (images == null) ? new HashMap<String, ImageEntry>() : images;
|
||||
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, String>(0) : anchors;
|
||||
this.images = (images == null) ? new HashMap<MultiProtocolURI, ImageEntry>() : images;
|
||||
this.publisher = publisher;
|
||||
this.hyperlinks = null;
|
||||
this.audiolinks = null;
|
||||
|
@ -159,7 +159,7 @@ dc_rights
|
|||
*/
|
||||
|
||||
public String dc_title() {
|
||||
return title.toString();
|
||||
return (title == null) ? "" : title.toString();
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
|
@ -167,9 +167,7 @@ dc_rights
|
|||
}
|
||||
|
||||
public String dc_creator() {
|
||||
if (creator == null)
|
||||
return "";
|
||||
return creator.toString();
|
||||
return (creator == null) ? "" : creator.toString();
|
||||
}
|
||||
|
||||
public String dc_subject(final char separator) {
|
||||
|
@ -196,7 +194,7 @@ dc_rights
|
|||
}
|
||||
|
||||
public String dc_publisher() {
|
||||
return this.publisher;
|
||||
return this.publisher == null ? "" : this.publisher;
|
||||
}
|
||||
|
||||
public String dc_format() {
|
||||
|
@ -207,7 +205,7 @@ dc_rights
|
|||
return this.source.toNormalform(true, false);
|
||||
}
|
||||
|
||||
public DigestURI dc_source() {
|
||||
public MultiProtocolURI dc_source() {
|
||||
return this.source;
|
||||
}
|
||||
|
||||
|
@ -282,7 +280,7 @@ dc_rights
|
|||
return this.keywords;
|
||||
}
|
||||
|
||||
public Map<DigestURI, String> getAnchors() {
|
||||
public Map<MultiProtocolURI, String> getAnchors() {
|
||||
// returns all links embedded as anchors (clickeable entities)
|
||||
// this is a url(String)/text(String) map
|
||||
return anchors;
|
||||
|
@ -291,30 +289,30 @@ dc_rights
|
|||
|
||||
// the next three methods provide a calculated view on the getAnchors/getImages:
|
||||
|
||||
public Map<DigestURI, String> getHyperlinks() {
|
||||
public Map<MultiProtocolURI, String> getHyperlinks() {
|
||||
// this is a subset of the getAnchor-set: only links to other hyperrefs
|
||||
if (!resorted) resortLinks();
|
||||
return hyperlinks;
|
||||
}
|
||||
|
||||
public Map<DigestURI, String> getAudiolinks() {
|
||||
public Map<MultiProtocolURI, String> getAudiolinks() {
|
||||
if (!resorted) resortLinks();
|
||||
return this.audiolinks;
|
||||
}
|
||||
|
||||
public Map<DigestURI, String> getVideolinks() {
|
||||
public Map<MultiProtocolURI, String> getVideolinks() {
|
||||
if (!resorted) resortLinks();
|
||||
return this.videolinks;
|
||||
}
|
||||
|
||||
public HashMap<String, ImageEntry> getImages() {
|
||||
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
|
||||
// returns all links embedded as pictures (visible in document)
|
||||
// this returns a htmlFilterImageEntry collection
|
||||
if (!resorted) resortLinks();
|
||||
return images;
|
||||
}
|
||||
|
||||
public Map<DigestURI, String> getApplinks() {
|
||||
public Map<MultiProtocolURI, String> getApplinks() {
|
||||
if (!resorted) resortLinks();
|
||||
return this.applinks;
|
||||
}
|
||||
|
@ -329,18 +327,18 @@ dc_rights
|
|||
if (this.resorted) return;
|
||||
|
||||
// extract hyperlinks, medialinks and emaillinks from anchorlinks
|
||||
DigestURI url;
|
||||
MultiProtocolURI url;
|
||||
String u;
|
||||
int extpos, qpos;
|
||||
String ext = null;
|
||||
final Iterator<Map.Entry<DigestURI, String>> i = anchors.entrySet().iterator();
|
||||
hyperlinks = new HashMap<DigestURI, String>();
|
||||
videolinks = new HashMap<DigestURI, String>();
|
||||
audiolinks = new HashMap<DigestURI, String>();
|
||||
applinks = new HashMap<DigestURI, String>();
|
||||
final Iterator<Map.Entry<MultiProtocolURI, String>> i = anchors.entrySet().iterator();
|
||||
hyperlinks = new HashMap<MultiProtocolURI, String>();
|
||||
videolinks = new HashMap<MultiProtocolURI, String>();
|
||||
audiolinks = new HashMap<MultiProtocolURI, String>();
|
||||
applinks = new HashMap<MultiProtocolURI, String>();
|
||||
emaillinks = new HashMap<String, String>();
|
||||
final HashMap<String, ImageEntry> collectedImages = new HashMap<String, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
|
||||
Map.Entry<DigestURI, String> entry;
|
||||
final HashMap<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
|
||||
Map.Entry<MultiProtocolURI, String> entry;
|
||||
while (i.hasNext()) {
|
||||
entry = i.next();
|
||||
url = entry.getKey();
|
||||
|
@ -393,21 +391,21 @@ dc_rights
|
|||
this.resorted = true;
|
||||
}
|
||||
|
||||
public static Map<DigestURI, String> allSubpaths(final Collection<?> links) {
|
||||
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
|
||||
// links is either a Set of Strings (urls) or a Set of
|
||||
// htmlFilterImageEntries
|
||||
final HashSet<String> h = new HashSet<String>();
|
||||
Iterator<?> i = links.iterator();
|
||||
Object o;
|
||||
DigestURI url;
|
||||
MultiProtocolURI url;
|
||||
String u;
|
||||
int pos;
|
||||
int l;
|
||||
while (i.hasNext())
|
||||
try {
|
||||
o = i.next();
|
||||
if (o instanceof DigestURI) url = (DigestURI) o;
|
||||
else if (o instanceof String) url = new DigestURI((String) o, null);
|
||||
if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o;
|
||||
else if (o instanceof String) url = new MultiProtocolURI((String) o);
|
||||
else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
|
||||
else {
|
||||
assert false;
|
||||
|
@ -428,11 +426,11 @@ dc_rights
|
|||
} catch (final MalformedURLException e) { }
|
||||
// now convert the strings to yacyURLs
|
||||
i = h.iterator();
|
||||
final HashMap<DigestURI, String> v = new HashMap<DigestURI, String>();
|
||||
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
|
||||
while (i.hasNext()) {
|
||||
u = (String) i.next();
|
||||
try {
|
||||
url = new DigestURI(u, null);
|
||||
url = new MultiProtocolURI(u);
|
||||
v.put(url, "sub");
|
||||
} catch (final MalformedURLException e) {
|
||||
}
|
||||
|
@ -440,23 +438,23 @@ dc_rights
|
|||
return v;
|
||||
}
|
||||
|
||||
public static Map<DigestURI, String> allReflinks(final Collection<?> links) {
|
||||
public static Map<MultiProtocolURI, String> allReflinks(final Collection<?> links) {
|
||||
// links is either a Set of Strings (with urls) or
|
||||
// htmlFilterImageEntries
|
||||
// we find all links that are part of a reference inside a url
|
||||
final HashMap<DigestURI, String> v = new HashMap<DigestURI, String>();
|
||||
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
|
||||
final Iterator<?> i = links.iterator();
|
||||
Object o;
|
||||
DigestURI url;
|
||||
MultiProtocolURI url;
|
||||
String u;
|
||||
int pos;
|
||||
loop: while (i.hasNext())
|
||||
try {
|
||||
o = i.next();
|
||||
if (o instanceof DigestURI)
|
||||
url = (DigestURI) o;
|
||||
if (o instanceof MultiProtocolURI)
|
||||
url = (MultiProtocolURI) o;
|
||||
else if (o instanceof String)
|
||||
url = new DigestURI((String) o, null);
|
||||
url = new MultiProtocolURI((String) o);
|
||||
else if (o instanceof ImageEntry)
|
||||
url = ((ImageEntry) o).url();
|
||||
else {
|
||||
|
@ -469,7 +467,7 @@ dc_rights
|
|||
u = u.substring(pos);
|
||||
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
|
||||
u = u.substring(pos);
|
||||
url = new DigestURI(u, null);
|
||||
url = new MultiProtocolURI(u);
|
||||
if (!(v.containsKey(url)))
|
||||
v.put(url, "ref");
|
||||
continue loop;
|
||||
|
@ -479,7 +477,7 @@ dc_rights
|
|||
u = "http:/" + u.substring(pos);
|
||||
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
|
||||
u = "http:/" + u.substring(pos);
|
||||
url = new DigestURI(u, null);
|
||||
url = new MultiProtocolURI(u);
|
||||
if (!(v.containsKey(url)))
|
||||
v.put(url, "ref");
|
||||
continue loop;
|
||||
|
@ -512,14 +510,14 @@ dc_rights
|
|||
/**
|
||||
* @return the {@link URL} to the favicon that belongs to the document
|
||||
*/
|
||||
public DigestURI getFavicon() {
|
||||
public MultiProtocolURI getFavicon() {
|
||||
return this.favicon;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param faviconURL the {@link URL} to the favicon that belongs to the document
|
||||
*/
|
||||
public void setFavicon(final DigestURI faviconURL) {
|
||||
public void setFavicon(final MultiProtocolURI faviconURL) {
|
||||
this.favicon = faviconURL;
|
||||
}
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ import java.io.File;
|
|||
import java.io.InputStream;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -51,7 +51,7 @@ public interface Idiom {
|
|||
*
|
||||
* @throws ParserException if the content could not be parsed properly
|
||||
*/
|
||||
public Document parse(DigestURI location, String mimeType, String charset, byte[] source)
|
||||
public Document parse(MultiProtocolURI location, String mimeType, String charset, byte[] source)
|
||||
throws ParserException, InterruptedException;
|
||||
|
||||
/**
|
||||
|
@ -65,7 +65,7 @@ public interface Idiom {
|
|||
*
|
||||
* @throws ParserException if the content could not be parsed properly
|
||||
*/
|
||||
public Document parse(DigestURI location, String mimeType, String charset, File sourceFile)
|
||||
public Document parse(MultiProtocolURI location, String mimeType, String charset, File sourceFile)
|
||||
throws ParserException, InterruptedException;
|
||||
|
||||
/**
|
||||
|
@ -79,7 +79,7 @@ public interface Idiom {
|
|||
*
|
||||
* @throws ParserException if the content could not be parsed properly
|
||||
*/
|
||||
public Document parse(DigestURI location, String mimeType, String charset, InputStream source)
|
||||
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source)
|
||||
throws ParserException, InterruptedException;
|
||||
|
||||
/**
|
||||
|
|
|
@ -24,10 +24,10 @@
|
|||
|
||||
package net.yacy.document;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
|
||||
public class ParserException extends Exception {
|
||||
private DigestURI url = null;
|
||||
private MultiProtocolURI url = null;
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
|
@ -35,12 +35,12 @@ public class ParserException extends Exception {
|
|||
super();
|
||||
}
|
||||
|
||||
public ParserException(final String message, final DigestURI url) {
|
||||
public ParserException(final String message, final MultiProtocolURI url) {
|
||||
super(message + "; url = " + url.toNormalform(true, false));
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
public DigestURI getURL() {
|
||||
public MultiProtocolURI getURL() {
|
||||
return this.url;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.bzipParser;
|
||||
import net.yacy.document.parser.csvParser;
|
||||
import net.yacy.document.parser.docParser;
|
||||
|
@ -61,7 +62,6 @@ import net.yacy.document.parser.vsdParser;
|
|||
import net.yacy.document.parser.xlsParser;
|
||||
import net.yacy.document.parser.zipParser;
|
||||
import net.yacy.document.parser.images.genericImageParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -138,7 +138,7 @@ public final class TextParser {
|
|||
}
|
||||
|
||||
public static Document parseSource(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final String mimeType,
|
||||
final String charset,
|
||||
final File sourceFile
|
||||
|
@ -167,7 +167,7 @@ public final class TextParser {
|
|||
}
|
||||
|
||||
public static Document parseSource(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
String mimeType,
|
||||
final String charset,
|
||||
final byte[] content
|
||||
|
@ -176,7 +176,7 @@ public final class TextParser {
|
|||
}
|
||||
|
||||
public static Document parseSource(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
String mimeType,
|
||||
final String charset,
|
||||
final long contentLength,
|
||||
|
@ -211,7 +211,7 @@ public final class TextParser {
|
|||
}
|
||||
|
||||
private static Document parseSource(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
String mimeType,
|
||||
Idiom idiom,
|
||||
final String charset,
|
||||
|
@ -233,7 +233,7 @@ public final class TextParser {
|
|||
}
|
||||
|
||||
private static Document parseSource(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
String mimeType,
|
||||
List<Idiom> idioms,
|
||||
final String charset,
|
||||
|
@ -280,7 +280,7 @@ public final class TextParser {
|
|||
* @param mimeType
|
||||
* @return returns null if the content is supported. If the content is not supported, return a error string.
|
||||
*/
|
||||
public static String supports(final DigestURI url, String mimeType) {
|
||||
public static String supports(final MultiProtocolURI url, String mimeType) {
|
||||
try {
|
||||
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
|
||||
List<Idiom> idioms = idiomParser(url, mimeType);
|
||||
|
@ -304,7 +304,7 @@ public final class TextParser {
|
|||
* @return a list of Idiom parsers that may be appropriate for the given criteria
|
||||
* @throws ParserException
|
||||
*/
|
||||
private static List<Idiom> idiomParser(final DigestURI url, String mimeType1) throws ParserException {
|
||||
private static List<Idiom> idiomParser(final MultiProtocolURI url, String mimeType1) throws ParserException {
|
||||
List<Idiom> idioms = new ArrayList<Idiom>(2);
|
||||
|
||||
// check extension
|
||||
|
@ -345,7 +345,7 @@ public final class TextParser {
|
|||
return null;
|
||||
}
|
||||
|
||||
public static String supportsExtension(final DigestURI url) {
|
||||
public static String supportsExtension(final MultiProtocolURI url) {
|
||||
String ext = url.getFileExtension().toLowerCase();
|
||||
if (ext == null || ext.length() == 0) return null;
|
||||
if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
|
||||
|
@ -357,7 +357,7 @@ public final class TextParser {
|
|||
return null;
|
||||
}
|
||||
|
||||
public static String mimeOf(DigestURI url) {
|
||||
public static String mimeOf(MultiProtocolURI url) {
|
||||
return mimeOf(url.getFileExtension());
|
||||
}
|
||||
|
||||
|
|
|
@ -33,12 +33,12 @@ import java.io.InputStream;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
import org.apache.tools.bzip2.CBZip2InputStream;
|
||||
|
@ -75,7 +75,7 @@ public class bzipParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
File tempFile = null;
|
||||
try {
|
||||
|
|
|
@ -37,11 +37,11 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
/**
|
||||
* a parser for comma-separated values
|
||||
|
@ -73,7 +73,7 @@ public class csvParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
// construct a document using all cells of the document
|
||||
// the first row is used as headline
|
||||
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
|
||||
|
@ -112,7 +112,7 @@ public class csvParser extends AbstractParser implements Idiom {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
public List<String[]> getTable(DigestURI location, String mimeType, String charset, InputStream source) {
|
||||
public List<String[]> getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) {
|
||||
ArrayList<String[]> rows = new ArrayList<String[]>();
|
||||
BufferedReader reader;
|
||||
try {
|
||||
|
|
|
@ -32,11 +32,11 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
|
||||
|
@ -65,7 +65,7 @@ public class docParser extends AbstractParser implements Idiom {
|
|||
super("Word Document Parser");
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
final WordExtractor extractor;
|
||||
|
||||
|
|
|
@ -34,12 +34,12 @@ import java.util.HashSet;
|
|||
import java.util.Set;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
||||
|
@ -74,7 +74,7 @@ public class gzipParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
File tempFile = null;
|
||||
try {
|
||||
|
|
|
@ -44,8 +44,8 @@ import java.util.Properties;
|
|||
|
||||
import javax.swing.event.EventListenerList;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.parser.htmlParser;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -79,8 +79,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
|
||||
// class variables: collectors for links
|
||||
private HashMap<DigestURI, String> anchors;
|
||||
private HashMap<String, ImageEntry> images; // urlhash/image relation
|
||||
private HashMap<MultiProtocolURI, String> anchors;
|
||||
private HashMap<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
|
||||
private final HashMap<String, String> metas;
|
||||
private String title;
|
||||
//private String headline;
|
||||
|
@ -89,23 +89,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
private final EventListenerList htmlFilterEventListeners;
|
||||
|
||||
/**
|
||||
* {@link DigestURI} to the favicon that belongs to the document
|
||||
* {@link MultiProtocolURI} to the favicon that belongs to the document
|
||||
*/
|
||||
private DigestURI favicon;
|
||||
private MultiProtocolURI favicon;
|
||||
|
||||
/**
|
||||
* The document root {@link DigestURI}
|
||||
* The document root {@link MultiProtocolURI}
|
||||
*/
|
||||
private DigestURI root;
|
||||
private MultiProtocolURI root;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public ContentScraper(final DigestURI root) {
|
||||
public ContentScraper(final MultiProtocolURI root) {
|
||||
// the root value here will not be used to load the resource.
|
||||
// it is only the reference for relative links
|
||||
super(linkTags0, linkTags1);
|
||||
this.root = root;
|
||||
this.anchors = new HashMap<DigestURI, String>();
|
||||
this.images = new HashMap<String, ImageEntry>();
|
||||
this.anchors = new HashMap<MultiProtocolURI, String>();
|
||||
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
this.metas = new HashMap<String, String>();
|
||||
this.title = "";
|
||||
this.headlines = new ArrayList[4];
|
||||
|
@ -133,9 +133,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (b.length() != 0) content.append(b).append(32);
|
||||
}
|
||||
|
||||
private DigestURI absolutePath(final String relativePath) {
|
||||
private MultiProtocolURI absolutePath(final String relativePath) {
|
||||
try {
|
||||
return DigestURI.newURL(root, relativePath);
|
||||
return MultiProtocolURI.newURL(root, relativePath);
|
||||
} catch (final Exception e) {
|
||||
return null;
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (width > 15 && height > 15) {
|
||||
final float ratio = (float) Math.min(width, height) / Math.max(width, height);
|
||||
if (ratio > 0.4) {
|
||||
final DigestURI url = absolutePath(tagopts.getProperty("src", ""));
|
||||
final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
|
||||
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
|
||||
addImage(images, ie);
|
||||
}
|
||||
|
@ -162,7 +162,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
} catch (final NumberFormatException e) {}
|
||||
}
|
||||
if (tagname.equalsIgnoreCase("base")) try {
|
||||
root = new DigestURI(tagopts.getProperty("href", ""), null);
|
||||
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
|
||||
} catch (final MalformedURLException e) {}
|
||||
if (tagname.equalsIgnoreCase("frame")) {
|
||||
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
|
||||
|
@ -185,7 +185,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
|
||||
}
|
||||
if (tagname.equalsIgnoreCase("link")) {
|
||||
final DigestURI newLink = absolutePath(tagopts.getProperty("href", ""));
|
||||
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
|
||||
|
||||
if (newLink != null) {
|
||||
final String type = tagopts.getProperty("rel", "");
|
||||
|
@ -193,7 +193,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
|
||||
if (type.equalsIgnoreCase("shortcut icon")) {
|
||||
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
|
||||
images.put(new String(ie.url().hash()), ie);
|
||||
images.put(ie.url(), ie);
|
||||
this.favicon = newLink;
|
||||
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
|
||||
anchors.put(newLink, linktitle);
|
||||
|
@ -220,7 +220,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
|
||||
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
|
||||
final String href = tagopts.getProperty("href", "");
|
||||
DigestURI url;
|
||||
MultiProtocolURI url;
|
||||
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
|
||||
final String f = url.getFile();
|
||||
final int p = f.lastIndexOf('.');
|
||||
|
@ -350,7 +350,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
}
|
||||
|
||||
public Map<DigestURI, String> getAnchors() {
|
||||
public Map<MultiProtocolURI, String> getAnchors() {
|
||||
// returns a url (String) / name (String) relation
|
||||
return anchors;
|
||||
}
|
||||
|
@ -359,7 +359,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
* get all images
|
||||
* @return a map of <image URL, ImageEntry>
|
||||
*/
|
||||
public HashMap<String, ImageEntry> getImages() {
|
||||
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
|
||||
// this returns a String(absolute url)/htmlFilterImageEntry - relation
|
||||
return images;
|
||||
}
|
||||
|
@ -369,9 +369,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
}
|
||||
|
||||
/**
|
||||
* @return the {@link DigestURI} to the favicon that belongs to the document
|
||||
* @return the {@link MultiProtocolURI} to the favicon that belongs to the document
|
||||
*/
|
||||
public DigestURI getFavicon() {
|
||||
public MultiProtocolURI getFavicon() {
|
||||
return this.favicon;
|
||||
}
|
||||
|
||||
|
@ -442,7 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (s == null) s = metas.get("dc.description");
|
||||
if (s == null) s = "";
|
||||
if (s.length() == 0) {
|
||||
return DigestURI.splitpattern.split(getTitle().toLowerCase());
|
||||
return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
|
||||
}
|
||||
if (s.contains(",")) return s.split(" |,");
|
||||
if (s.contains(";")) return s.split(" |;");
|
||||
|
@ -536,32 +536,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
if (page == null) throw new IOException("no content in file " + file.toString());
|
||||
|
||||
// scrape document to look up charset
|
||||
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8",new DigestURI("http://localhost", null),null,false);
|
||||
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
|
||||
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
|
||||
|
||||
// scrape content
|
||||
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost", null));
|
||||
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
|
||||
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
|
||||
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
|
||||
|
||||
return scraper;
|
||||
}
|
||||
|
||||
public static void addAllImages(final HashMap<String, ImageEntry> a, final HashMap<String, ImageEntry> b) {
|
||||
final Iterator<Map.Entry<String, ImageEntry>> i = b.entrySet().iterator();
|
||||
Map.Entry<String, ImageEntry> ie;
|
||||
public static void addAllImages(final HashMap<MultiProtocolURI, ImageEntry> a, final HashMap<MultiProtocolURI, ImageEntry> b) {
|
||||
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
|
||||
Map.Entry<MultiProtocolURI, ImageEntry> ie;
|
||||
while (i.hasNext()) {
|
||||
ie = i.next();
|
||||
addImage(a, ie.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
public static void addImage(final HashMap<String, ImageEntry> a, final ImageEntry ie) {
|
||||
if (a.containsKey(new String(ie.url().hash()))) {
|
||||
public static void addImage(final HashMap<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
|
||||
if (a.containsKey(ie.url())) {
|
||||
// in case of a collision, take that image that has the better image size tags
|
||||
if ((ie.height() > 0) && (ie.width() > 0)) a.put(new String(ie.url().hash()), ie);
|
||||
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie);
|
||||
} else {
|
||||
a.put(new String(ie.url().hash()), ie);
|
||||
a.put(ie.url(), ie);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -26,16 +26,16 @@ package net.yacy.document.parser.html;
|
|||
|
||||
import java.util.Comparator;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
|
||||
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
|
||||
|
||||
private final DigestURI url;
|
||||
private final MultiProtocolURI url;
|
||||
private final String alt;
|
||||
private final int width, height;
|
||||
private final long fileSize;
|
||||
|
||||
public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) {
|
||||
public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) {
|
||||
this.url = url;
|
||||
this.alt = alt;
|
||||
this.width = width;
|
||||
|
@ -43,7 +43,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
|
|||
this.fileSize = fileSize;
|
||||
}
|
||||
|
||||
public DigestURI url() {
|
||||
public MultiProtocolURI url() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.io.Writer;
|
||||
import java.util.Properties;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
|
||||
|
||||
public class ScraperInputStream extends InputStream implements ScraperListener {
|
||||
|
@ -58,7 +58,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
|
|||
public ScraperInputStream(
|
||||
final InputStream inStream,
|
||||
final String inputStreamCharset,
|
||||
final DigestURI rooturl,
|
||||
final MultiProtocolURI rooturl,
|
||||
final Transformer transformer,
|
||||
final boolean passbyIfBinarySuspect
|
||||
) {
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.nio.charset.UnsupportedCharsetException;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
|
@ -41,7 +42,6 @@ import net.yacy.document.ParserException;
|
|||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ScraperInputStream;
|
||||
import net.yacy.document.parser.html.TransformerWriter;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
||||
|
@ -84,7 +84,7 @@ public class htmlParser extends AbstractParser implements Idiom {
|
|||
|
||||
@Override
|
||||
public Document parse(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final String mimeType,
|
||||
final String documentCharset,
|
||||
final InputStream sourceStream) throws ParserException, InterruptedException {
|
||||
|
@ -136,7 +136,7 @@ public class htmlParser extends AbstractParser implements Idiom {
|
|||
return transformScraper(location, mimeType, documentCharset, scraper);
|
||||
}
|
||||
|
||||
private static Document transformScraper(final DigestURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
|
||||
private static Document transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
|
||||
final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
|
||||
int p = 0;
|
||||
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
|
||||
|
|
|
@ -50,13 +50,13 @@ import com.sun.image.codec.jpeg.JPEGCodec;
|
|||
import com.sun.image.codec.jpeg.JPEGDecodeParam;
|
||||
import com.sun.image.codec.jpeg.JPEGImageDecoder;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -88,7 +88,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public Document parse(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final String mimeType,
|
||||
final String documentCharset,
|
||||
final InputStream sourceStream) throws ParserException, InterruptedException {
|
||||
|
@ -170,11 +170,11 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
final HashSet<String> languages = new HashSet<String>();
|
||||
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
|
||||
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
// add this image to the map of images
|
||||
String infoString = ii.info.toString();
|
||||
images.put(infoString, new ImageEntry(location, "", ii.width, ii.height, -1));
|
||||
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
|
||||
|
||||
if (title == null) title = location.toNormalform(true, true);
|
||||
|
||||
|
@ -204,7 +204,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
public static ImageInfo parseJavaImage(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final InputStream sourceStream) throws ParserException {
|
||||
BufferedImage image = null;
|
||||
try {
|
||||
|
@ -222,7 +222,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
public static ImageInfo parseJavaImage(
|
||||
final DigestURI location,
|
||||
final MultiProtocolURI location,
|
||||
final BufferedImage image) {
|
||||
ImageInfo ii = new ImageInfo(location);
|
||||
ii.image = image;
|
||||
|
@ -259,12 +259,12 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
public static class ImageInfo {
|
||||
public DigestURI location;
|
||||
public MultiProtocolURI location;
|
||||
public BufferedImage image;
|
||||
public StringBuilder info;
|
||||
public int height;
|
||||
public int width;
|
||||
public ImageInfo(final DigestURI location) {
|
||||
public ImageInfo(final MultiProtocolURI location) {
|
||||
this.location = location;
|
||||
this.image = null;
|
||||
this.info = new StringBuilder();
|
||||
|
@ -278,9 +278,9 @@ public class genericImageParser extends AbstractParser implements Idiom {
|
|||
public static void main(final String[] args) {
|
||||
File image = new File(args[0]);
|
||||
genericImageParser parser = new genericImageParser();
|
||||
DigestURI uri;
|
||||
MultiProtocolURI uri;
|
||||
try {
|
||||
uri = new DigestURI("http://localhost/" + image.getName());
|
||||
uri = new MultiProtocolURI("http://localhost/" + image.getName());
|
||||
Document document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image));
|
||||
System.out.println(document.toString());
|
||||
} catch (MalformedURLException e) {
|
||||
|
|
|
@ -39,13 +39,13 @@ import java.util.zip.ZipFile;
|
|||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.parser.xml.ODContentHandler;
|
||||
import net.yacy.document.parser.xml.ODMetaHandler;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -106,7 +106,7 @@ public class odtParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
|
||||
|
||||
Writer writer = null;
|
||||
File writerFile = null;
|
||||
|
@ -228,7 +228,7 @@ public class odtParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
File dest = null;
|
||||
try {
|
||||
// creating a tempfile
|
||||
|
|
|
@ -39,13 +39,13 @@ import java.util.zip.ZipFile;
|
|||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.parser.xml.ODContentHandler;
|
||||
import net.yacy.document.parser.xml.ODMetaHandler;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -90,7 +90,7 @@ public class ooxmlParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
|
||||
|
||||
Writer writer = null;
|
||||
File writerFile = null;
|
||||
|
@ -215,7 +215,7 @@ public class ooxmlParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
File dest = null;
|
||||
try {
|
||||
// creating a tempfile
|
||||
|
|
|
@ -44,11 +44,11 @@ import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
|
|||
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
|
||||
import org.apache.pdfbox.util.PDFTextStripper;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
// create a pdf parser
|
||||
final PDDocument theDocument;
|
||||
|
|
|
@ -32,11 +32,11 @@ import java.io.InputStream;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
|
@ -70,7 +70,7 @@ public class pptParser extends AbstractParser implements Idiom {
|
|||
* parses the source documents and returns a plasmaParserDocument containing
|
||||
* all extracted information about the parsed document
|
||||
*/
|
||||
public Document parse(final DigestURI location, final String mimeType,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType,
|
||||
final String charset, final InputStream source) throws ParserException,
|
||||
InterruptedException {
|
||||
try {
|
||||
|
|
|
@ -37,11 +37,11 @@ import java.io.InputStreamReader;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
||||
|
@ -104,7 +104,7 @@ public class psParser extends AbstractParser implements Idiom {
|
|||
|
||||
|
||||
@Override
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException {
|
||||
|
||||
File outputFile = null;
|
||||
try {
|
||||
|
@ -277,7 +277,7 @@ public class psParser extends AbstractParser implements Idiom {
|
|||
super.reset();
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
File tempFile = null;
|
||||
try {
|
||||
|
|
|
@ -40,18 +40,18 @@ import java.util.LinkedList;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.Hit;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.content.RSSMessage;
|
||||
import net.yacy.document.parser.html.AbstractScraper;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.document.parser.html.TransformerWriter;
|
||||
import net.yacy.document.parser.xml.RSSFeed;
|
||||
import net.yacy.document.parser.xml.RSSReader;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.io.CharBuffer;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -78,11 +78,11 @@ public class rssParser extends AbstractParser implements Idiom {
|
|||
super("Rich Site Summary/Atom Feed Parser");
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
final LinkedList<String> feedSections = new LinkedList<String>();
|
||||
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
|
||||
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
final ByteBuffer text = new ByteBuffer();
|
||||
final CharBuffer authors = new CharBuffer();
|
||||
|
||||
|
@ -119,20 +119,20 @@ public class rssParser extends AbstractParser implements Idiom {
|
|||
|
||||
if (feed.getImage() != null) {
|
||||
try {
|
||||
DigestURI imgURL = new DigestURI(feed.getImage(), null);
|
||||
images.put(new String(imgURL.hash()), new ImageEntry(imgURL, feedTitle, -1, -1, -1));
|
||||
MultiProtocolURI imgURL = new MultiProtocolURI(feed.getImage());
|
||||
images.put(imgURL, new ImageEntry(imgURL, feedTitle, -1, -1, -1));
|
||||
} catch (MalformedURLException e) {}
|
||||
}
|
||||
|
||||
// loop through the feed items
|
||||
for (final RSSMessage item: feed) {
|
||||
for (final Hit item: feed) {
|
||||
// check for interruption
|
||||
checkInterruption();
|
||||
|
||||
final String itemTitle = item.getTitle();
|
||||
DigestURI itemURL = null;
|
||||
MultiProtocolURI itemURL = null;
|
||||
try {
|
||||
itemURL = new DigestURI(item.getLink(), null);
|
||||
itemURL = new MultiProtocolURI(item.getLink());
|
||||
} catch (MalformedURLException e) {
|
||||
continue;
|
||||
}
|
||||
|
@ -164,12 +164,12 @@ public class rssParser extends AbstractParser implements Idiom {
|
|||
feedSections.add(itemHeadline);
|
||||
}
|
||||
|
||||
final Map<DigestURI, String> itemLinks = scraper.getAnchors();
|
||||
final Map<MultiProtocolURI, String> itemLinks = scraper.getAnchors();
|
||||
if (itemLinks != null && !itemLinks.isEmpty()) {
|
||||
anchors.putAll(itemLinks);
|
||||
}
|
||||
|
||||
final HashMap<String, ImageEntry> itemImages = scraper.getImages();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> itemImages = scraper.getImages();
|
||||
if (itemImages != null && !itemImages.isEmpty()) {
|
||||
ContentScraper.addAllImages(images, itemImages);
|
||||
}
|
||||
|
|
|
@ -34,11 +34,11 @@ import java.util.Set;
|
|||
import javax.swing.text.DefaultStyledDocument;
|
||||
import javax.swing.text.rtf.RTFEditorKit;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
|
||||
public class rtfParser extends AbstractParser implements Idiom {
|
||||
|
@ -62,7 +62,7 @@ public class rtfParser extends AbstractParser implements Idiom {
|
|||
super("Rich Text Format Parser");
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
|
||||
try {
|
||||
|
|
|
@ -36,12 +36,12 @@ import java.io.OutputStream;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -69,7 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
|
|||
super("7zip Archive Parser");
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
|
||||
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, false);
|
||||
Handler archive;
|
||||
super.theLogger.logFine("opening 7zip archive...");
|
||||
|
@ -99,13 +99,13 @@ public class sevenzipParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
|
||||
final byte[] source) throws ParserException, InterruptedException {
|
||||
return parse(location, mimeType, charset, new ByteArrayIInStream(source));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
|
||||
final File sourceFile) throws ParserException, InterruptedException {
|
||||
try {
|
||||
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"));
|
||||
|
@ -114,7 +114,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
|
||||
final InputStream source) throws ParserException, InterruptedException {
|
||||
try {
|
||||
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
|
||||
|
@ -189,7 +189,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
|
|||
Document theDoc;
|
||||
// workaround for relative links in file, normally '#' shall be used behind the location, see
|
||||
// below for reversion of the effects
|
||||
final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
|
||||
final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
|
||||
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
|
||||
theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
|
||||
|
||||
|
|
|
@ -33,11 +33,11 @@ import java.util.HashMap;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import pt.tumba.parser.swf.SWF2HTML;
|
||||
|
@ -74,7 +74,7 @@ public class swfParser extends AbstractParser implements Idiom {
|
|||
* parses the source documents and returns a Document containing
|
||||
* all extracted information about the parsed document
|
||||
*/
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
try {
|
||||
final SWF2HTML swf2html = new SWF2HTML();
|
||||
|
@ -97,7 +97,7 @@ public class swfParser extends AbstractParser implements Idiom {
|
|||
final String[] sections = null;
|
||||
final String abstrct = null;
|
||||
//TreeSet images = null;
|
||||
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
|
||||
int urls = 0;
|
||||
int urlStart = -1;
|
||||
int urlEnd = 0;
|
||||
|
@ -114,7 +114,7 @@ public class swfParser extends AbstractParser implements Idiom {
|
|||
urlEnd = contents.indexOf(linebreak,urlStart);
|
||||
url = contents.substring(urlStart,urlEnd);
|
||||
urlnr = (Integer.valueOf(++urls)).toString();
|
||||
anchors.put(new DigestURI(url, null), urlnr);
|
||||
anchors.put(new MultiProtocolURI(url), urlnr);
|
||||
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
|
||||
}
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
|
@ -45,7 +46,6 @@ import net.yacy.document.TextParser;
|
|||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -81,7 +81,7 @@ public class tarParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
long docTextLength = 0;
|
||||
OutputStream docText = null;
|
||||
|
@ -106,8 +106,8 @@ public class tarParser extends AbstractParser implements Idiom {
|
|||
final LinkedList<String> docSections = new LinkedList<String>();
|
||||
final StringBuilder docAbstrct = new StringBuilder();
|
||||
|
||||
final Map<DigestURI, String> docAnchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<String, ImageEntry> docImages = new HashMap<String, ImageEntry>();
|
||||
final Map<MultiProtocolURI, String> docAnchors = new HashMap<MultiProtocolURI, String>();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> docImages = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
|
||||
// looping through the contained files
|
||||
TarEntry entry;
|
||||
|
@ -143,7 +143,7 @@ public class tarParser extends AbstractParser implements Idiom {
|
|||
checkInterruption();
|
||||
|
||||
// parsing the content
|
||||
subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
|
||||
subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
|
||||
} catch (final ParserException e) {
|
||||
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
|
||||
} finally {
|
||||
|
|
|
@ -36,12 +36,12 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.util.BDecoder;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
@ -75,7 +75,7 @@ public class torrentParser extends AbstractParser implements Idiom {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
|
||||
byte[] b = null;
|
||||
try {
|
||||
b = FileUtils.read(source);
|
||||
|
@ -141,7 +141,7 @@ public class torrentParser extends AbstractParser implements Idiom {
|
|||
try {
|
||||
byte[] b = FileUtils.read(new File(args[0]));
|
||||
torrentParser parser = new torrentParser();
|
||||
Document d = parser.parse(new DigestURI("http://localhost/test.torrent", null), null, "utf-8", b);
|
||||
Document d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", b);
|
||||
Condenser c = new Condenser(d, true, true);
|
||||
Map<String, Word> w = c.words();
|
||||
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
|
||||
|
|
|
@ -37,11 +37,11 @@ import java.util.Iterator;
|
|||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.order.Base64Order;
|
||||
|
||||
/**
|
||||
|
@ -80,13 +80,13 @@ public class vcfParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
try {
|
||||
final StringBuilder parsedTitle = new StringBuilder();
|
||||
final StringBuilder parsedDataText = new StringBuilder();
|
||||
final HashMap<String, String> parsedData = new HashMap<String, String>();
|
||||
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
|
||||
final LinkedList<String> parsedNames = new LinkedList<String>();
|
||||
|
||||
boolean useLastLine = false;
|
||||
|
@ -195,7 +195,7 @@ public class vcfParser extends AbstractParser implements Idiom {
|
|||
parsedData.clear();
|
||||
} else if (key.toUpperCase().startsWith("URL")) {
|
||||
try {
|
||||
final DigestURI newURL = new DigestURI(value, null);
|
||||
final MultiProtocolURI newURL = new MultiProtocolURI(value);
|
||||
anchors.put(newURL, newURL.toString());
|
||||
//parsedData.put(key,value);
|
||||
} catch (final MalformedURLException ex) {/* ignore this */}
|
||||
|
|
|
@ -31,11 +31,11 @@ import java.io.InputStream;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
|
@ -82,7 +82,7 @@ public class vsdParser extends AbstractParser implements Idiom {
|
|||
* parses the source documents and returns a Document containing
|
||||
* all extracted information about the parsed document
|
||||
*/
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
Document theDoc = null;
|
||||
|
||||
|
|
|
@ -31,11 +31,11 @@ import java.io.InputStream;
|
|||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
import net.yacy.document.ParserException;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
|
||||
|
@ -76,7 +76,7 @@ public class xlsParser extends AbstractParser implements Idiom {
|
|||
* parses the source documents and returns a Document containing
|
||||
* all extracted information about the parsed document
|
||||
*/
|
||||
public Document parse(final DigestURI location, final String mimeType,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType,
|
||||
final String charset, final InputStream source) throws ParserException,
|
||||
InterruptedException {
|
||||
return new XLSHSSFListener().parse(location, mimeType, charset, source);
|
||||
|
@ -111,7 +111,7 @@ public class xlsParser extends AbstractParser implements Idiom {
|
|||
* parses the source documents and returns a Document containing
|
||||
* all extracted information about the parsed document
|
||||
*/
|
||||
public Document parse(final DigestURI location, final String mimeType,
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType,
|
||||
final String charset, final InputStream source) throws ParserException,
|
||||
InterruptedException {
|
||||
try {
|
||||
|
|
|
@ -39,6 +39,7 @@ import java.util.Set;
|
|||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Idiom;
|
||||
|
@ -46,7 +47,6 @@ import net.yacy.document.TextParser;
|
|||
import net.yacy.document.ParserException;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.util.ByteBuffer;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
|
@ -82,7 +82,7 @@ public class zipParser extends AbstractParser implements Idiom {
|
|||
return SUPPORTED_EXTENSIONS;
|
||||
}
|
||||
|
||||
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
||||
|
||||
long docTextLength = 0;
|
||||
OutputStream docText = null;
|
||||
|
@ -95,8 +95,8 @@ public class zipParser extends AbstractParser implements Idiom {
|
|||
final StringBuilder docLongTitle = new StringBuilder();
|
||||
final LinkedList<String> docSections = new LinkedList<String>();
|
||||
final StringBuilder docAbstrct = new StringBuilder();
|
||||
final Map<DigestURI, String> docAnchors = new HashMap<DigestURI, String>();
|
||||
final HashMap<String, ImageEntry> docImages = new HashMap<String, ImageEntry>();
|
||||
final Map<MultiProtocolURI, String> docAnchors = new HashMap<MultiProtocolURI, String>();
|
||||
final HashMap<MultiProtocolURI, ImageEntry> docImages = new HashMap<MultiProtocolURI, ImageEntry>();
|
||||
|
||||
// looping through the contained files
|
||||
ZipEntry entry;
|
||||
|
@ -129,7 +129,7 @@ public class zipParser extends AbstractParser implements Idiom {
|
|||
FileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
|
||||
|
||||
// parsing the zip file entry
|
||||
subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
|
||||
subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
|
||||
} catch (final ParserException e) {
|
||||
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
|
||||
} finally {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -595,7 +595,7 @@ public class Domains {
|
|||
}
|
||||
|
||||
public static boolean isLocal(final String host) {
|
||||
assert (host != null);
|
||||
if (host == null) return true;
|
||||
|
||||
// FIXME IPv4 only
|
||||
// check local ip addresses
|
||||
|
|
|
@ -283,7 +283,7 @@ public class Blacklist {
|
|||
}
|
||||
|
||||
public boolean isListed(final String blacklistType, final DigestURI url) {
|
||||
|
||||
if (url.getHost() == null) return false;
|
||||
final HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
|
||||
if (!urlHashCache.has(url.hash())) {
|
||||
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
|
||||
|
|
|
@ -51,6 +51,7 @@ import net.yacy.kelondro.util.FileUtils;
|
|||
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.retrieval.FTPLoader;
|
||||
import de.anomic.crawler.retrieval.FileLoader;
|
||||
import de.anomic.crawler.retrieval.HTTPLoader;
|
||||
import de.anomic.crawler.retrieval.Request;
|
||||
import de.anomic.crawler.retrieval.Response;
|
||||
|
@ -73,17 +74,19 @@ public final class LoaderDispatcher {
|
|||
private final HTTPLoader httpLoader;
|
||||
private final FTPLoader ftpLoader;
|
||||
private final SMBLoader smbLoader;
|
||||
private final FileLoader fileLoader;
|
||||
private final Log log;
|
||||
|
||||
public LoaderDispatcher(final Switchboard sb) {
|
||||
this.sb = sb;
|
||||
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb"}));
|
||||
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb","file"}));
|
||||
|
||||
// initiate loader objects
|
||||
this.log = new Log("LOADER");
|
||||
httpLoader = new HTTPLoader(sb, log);
|
||||
ftpLoader = new FTPLoader(sb, log);
|
||||
smbLoader = new SMBLoader(sb, log);
|
||||
fileLoader = new FileLoader(sb, log);
|
||||
}
|
||||
|
||||
public boolean isSupportedProtocol(final String protocol) {
|
||||
|
@ -251,13 +254,14 @@ public final class LoaderDispatcher {
|
|||
}
|
||||
|
||||
// now it's for sure that we will access the target. Remember the access time
|
||||
accessTime.put(host, System.currentTimeMillis());
|
||||
if (host != null) accessTime.put(host, System.currentTimeMillis());
|
||||
|
||||
// load the resource with the loader that matches the protocol (http/https, ftp, smb or file)
|
||||
Response response = null;
|
||||
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
|
||||
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
|
||||
if (protocol.equals("smb")) response = smbLoader.load(request, true);
|
||||
if (protocol.equals("file")) response = fileLoader.load(request, true);
|
||||
if (response != null) {
|
||||
// we got something. Now check if we want to store that to the cache
|
||||
// the first check determines whether we want to store the content in the cache
|
||||
|
|
|
@ -2,13 +2,14 @@ package de.anomic.yacy;
|
|||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class yacyURLTest extends TestCase {
|
||||
|
||||
public void testResolveBackpath() throws MalformedURLException {
|
||||
public void testResolveBackpath() {
|
||||
String[][] testStrings = new String[][] {
|
||||
new String[]{"/..home","/..home"},
|
||||
new String[]{"/test/..home/test.html","/test/..home/test.html"},
|
||||
|
@ -23,14 +24,13 @@ public class yacyURLTest extends TestCase {
|
|||
new String[]{"/home/..test/../hallo/../","/home/"}
|
||||
};
|
||||
|
||||
DigestURI urlObj = new DigestURI("http://yacy.net");
|
||||
for (int i=0; i < testStrings.length; i++) {
|
||||
// desired conversion result
|
||||
System.out.print("testResolveBackpath: " + testStrings[i][0]);
|
||||
String shouldBe = testStrings[i][1];
|
||||
|
||||
// conversion result
|
||||
String resolvedURL = urlObj.resolveBackpath(testStrings[i][0]);
|
||||
String resolvedURL = MultiProtocolURI.resolveBackpath(testStrings[i][0]);
|
||||
|
||||
// test if equal
|
||||
assertEquals(shouldBe,resolvedURL);
|
||||
|
|
Loading…
Reference in New Issue
Block a user