- added new protocol loader for 'file'-type URLs

- it is now possible to crawl the local file system with an intranet peer
- redesign of URL handling
- refactoring: created LGPL-licensed package cora ('content retrieval api'), which may be used externally by other applications without the yacy core elements because it has no dependencies on other parts of yacy

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6902 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-05-25 12:54:57 +00:00
parent 2fd795207c
commit 11639aef35
86 changed files with 2134 additions and 1676 deletions

View File

@ -685,7 +685,10 @@ crawler.http.maxFileSize=1048576
crawler.ftp.maxFileSize=1048576
# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=50000000
crawler.smb.maxFileSize=100000000
# file crawler specific settings: maximum size
crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200

View File

@ -24,7 +24,7 @@
import java.util.Random;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.util.Domains;
import de.anomic.crawler.ResultImages;
@ -90,8 +90,8 @@ public class Collage {
final int yOffset = embed ? 0 : 70;
for (int i = 0; i < fifoSize; i++) {
final DigestURI baseURL = origins[i].baseURL;
final DigestURI imageURL = origins[i].imageEntry.url();
final MultiProtocolURI baseURL = origins[i].baseURL;
final MultiProtocolURI imageURL = origins[i].imageEntry.url();
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted

View File

@ -36,6 +36,7 @@ import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@ -234,7 +235,7 @@ public class Crawler_p {
// stack url
sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
@ -345,7 +346,7 @@ public class Crawler_p {
writer.close();
//String headline = scraper.getHeadline();
final Map<DigestURI, String> hyperlinks = scraper.getAnchors();
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
@ -370,11 +371,12 @@ public class Crawler_p {
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
// loop through the contained links
final Iterator<Map.Entry<DigestURI, String>> linkiterator = hyperlinks.entrySet().iterator();
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<DigestURI, String> e = linkiterator.next();
nexturl = e.getKey();
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
if (nexturl == null) continue;
// enqueuing the url for crawling

View File

@ -25,9 +25,9 @@
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -69,7 +69,7 @@ public class FeedReader_p {
prop.putHTML("page_description", feed.getChannel().getDescription());
int i = 0;
for (final RSSMessage item: feed) {
for (final Hit item: feed) {
prop.putHTML("page_items_" + i + "_author", item.getAuthor());
prop.putHTML("page_items_" + i + "_title", item.getTitle());
prop.putHTML("page_items_" + i + "_link", item.getLink());

View File

@ -159,23 +159,27 @@
<td colspan="2"><strong>http Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td>Maximum HTTP Filesize:</td>
<td class="settingsValue">#[crawler.http.maxFileSize]#</td>
</tr>
<tr>
<td colspan="2"><strong>ftp Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td>Maximum FTP Filesize:</td>
<td class="settingsValue">#[crawler.ftp.maxFileSize]#</td>
</tr>
<tr>
<td colspan="2"><strong>smb Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td>Maximum SMB Filesize:</td>
<td class="settingsValue">#[crawler.smb.maxFileSize]#</td>
</tr>
<tr>
<td>Maximum local-file Filesize:</td>
<td class="settingsValue">#[crawler.file.maxFileSize]#</td>
</tr>
</table>
::<!-- 29: Crawler settings timeout error -->
<p class="error">Invalid crawler timeout value: <tt>#[crawler.clientTimeout]#</tt></p>

View File

@ -503,18 +503,32 @@ public class SettingsAck_p {
long maxSmbSize;
try {
maxSmbSize = Integer.parseInt(maxSizeStr);
env.setConfig("crawler.smb.maxFileSize", Long.toString(maxFtpSize));
env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize));
} catch (final NumberFormatException e) {
prop.put("info", "31");
prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize"));
return prop;
}
maxSizeStr = post.get("crawler.file.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxFileSize;
try {
maxFileSize = Integer.parseInt(maxSizeStr);
env.setConfig("crawler.file.maxFileSize", Long.toString(maxFileSize));
} catch (final NumberFormatException e) {
prop.put("info", "31");
prop.putHTML("info_crawler.file.maxFileSize",post.get("crawler.file.maxFileSize"));
return prop;
}
// everything is ok
prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout));
prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize));
prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize));
prop.put("info_crawler.smb.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
prop.put("info_crawler.file.maxFileSize", (maxFileSize==-1) ? "-1":Formatter.bytesToString(maxFileSize));
prop.put("info", "28");
return prop;
}

View File

@ -26,6 +26,22 @@
</tr>
<tr><td colspan="3"><hr /></td></tr>
<tr><td colspan="3"><p><strong>SMB Crawler Settings</strong>:</p></td></tr>
<tr valign="top">
<td>Maximum Filesize:</td>
<td><input name="crawler.smb.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.smb.maxFileSize]#" /></td>
<td><em>Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. <code>-1</code> means unlimited.</em></td>
</tr>
<tr><td colspan="3"><hr /></td></tr>
<tr><td colspan="3"><p><strong>Local File Crawler Settings</strong>:</p></td></tr>
<tr valign="top">
<td>Maximum Filesize:</td>
<td><input name="crawler.file.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.file.maxFileSize]#" /></td>
<td><em>Maximum allowed file size in bytes that should be downloaded. Larger files will be skipped. <code>-1</code> means unlimited.</em></td>
</tr>
<tr><td colspan="3"><hr /></td></tr>
<tr valign="top">
<td>&nbsp;</td>
<td><input type="submit" name="crawlerSettings" value="Submit" /></td>

View File

@ -202,6 +202,7 @@ public final class Settings_p {
prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1"));
prop.putHTML("crawler.file.maxFileSize",sb.getConfig("crawler.file.maxFileSize", "-1"));
// return rewrite properties
return prop;

View File

@ -35,6 +35,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
@ -372,7 +373,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
final HashMap<String, ImageEntry> ts = document.getImages();
final HashMap<MultiProtocolURI, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {
@ -439,9 +440,9 @@ public class ViewFile {
return message;
}
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<DigestURI, String> media, final String name, boolean dark) {
final Iterator<Map.Entry<DigestURI, String>> mi = media.entrySet().iterator();
Map.Entry<DigestURI, String> entry;
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
final Iterator<Map.Entry<MultiProtocolURI, String>> mi = media.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
int i = 0;
while (mi.hasNext()) {
entry = mi.next();

View File

@ -2,8 +2,8 @@
import java.util.Date;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;

View File

@ -30,8 +30,8 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.Hit;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
@ -57,7 +57,7 @@ public class rct_p {
final yacySeed seed = (peerhash == null) ? null : sb.peers.getConnected(peerhash);
final RSSFeed feed = (seed == null) ? null : yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000);
if (feed != null) {
for (final RSSMessage item: feed) {
for (final Hit item: feed) {
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack

View File

@ -35,8 +35,8 @@ import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;

View File

@ -30,8 +30,8 @@
import java.util.ArrayList;
import java.util.Iterator;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleSet;

View File

@ -29,8 +29,8 @@
import java.io.IOException;
import java.text.ParseException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;

View File

@ -32,11 +32,11 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.geolocalization.Location;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;

View File

@ -22,7 +22,8 @@ import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import net.yacy.document.content.RSSMessage;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.services.Search;
import net.yacy.document.geolocalization.Location;
import de.anomic.data.LibraryProvider;
import de.anomic.http.server.HeaderFramework;
@ -32,7 +33,6 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyClient;
import java.util.Date;
import net.yacy.kelondro.util.DateFormatter;
@ -91,7 +91,8 @@ public class yacysearch_location {
if (search_title || search_publisher || search_creator || search_subject) try {
// get a queue of search results
BlockingQueue<RSSMessage> results = yacyClient.search(null, query, false, false, maximumTime, Integer.MAX_VALUE);
String rssSearchServiceURL = "http://localhost:" + sb.getConfig("port", "8080") + "/yacysearch.rss";
BlockingQueue<RSSMessage> results = Search.search(rssSearchServiceURL, query, false, false, maximumTime, Integer.MAX_VALUE);
// take the results and compute some locations
RSSMessage message;

View File

@ -36,8 +36,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -421,7 +421,7 @@ public class CrawlQueues {
// parse the rss
DigestURI url, referrer;
Date loaddate;
for (final RSSMessage item: feed) {
for (final Hit item: feed) {
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());
// put url on remote crawl stack

View File

@ -354,6 +354,7 @@ public final class CrawlStacker {
// returns true if the url can be accepted accoring to network.unit.domain
if (url == null) return "url is null";
final String host = url.getHost();
if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null;
if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
// check if this is a local address and we are allowed to index local pages:

View File

@ -30,9 +30,9 @@ import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
public class ResultImages {
@ -48,18 +48,17 @@ public class ResultImages {
// we also check all links for a double-check so we don't get the same image more than once in any queue
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
// the same images may be linked from different pages
private static final ConcurrentHashMap<String, Long> doubleCheck = new ConcurrentHashMap<String, Long>(); // (url-hash, time) when the url appeared first
private static final ConcurrentHashMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first
public static void registerImages(final Document document, final boolean privateEntry) {
if (document == null) return;
if (document.dc_source() == null) return;
final HashMap<String, ImageEntry> images = document.getImages();
final HashMap<MultiProtocolURI, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
String hashstring = new String(image.url().hash());
if (doubleCheck.containsKey(hashstring)) continue;
doubleCheck.put(hashstring, System.currentTimeMillis());
if (doubleCheck.containsKey(image.url())) continue;
doubleCheck.put(image.url(), System.currentTimeMillis());
final String name = image.url().getFile();
boolean good = false;
@ -144,8 +143,8 @@ public class ResultImages {
public static class OriginEntry {
public ImageEntry imageEntry;
public DigestURI baseURL;
public OriginEntry(final ImageEntry imageEntry, final DigestURI baseURL) {
public MultiProtocolURI baseURL;
public OriginEntry(final ImageEntry imageEntry, final MultiProtocolURI baseURL) {
this.imageEntry = imageEntry;
this.baseURL = baseURL;
}

View File

@ -35,6 +35,7 @@ import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -317,7 +318,7 @@ public class RobotsTxt {
reqHeaders.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
// adding referer
reqHeaders.put(RequestHeader.REFERER, (DigestURI.newURL(robotsURL,"/")).toNormalform(true, true));
reqHeaders.put(RequestHeader.REFERER, (MultiProtocolURI.newURL(robotsURL,"/")).toNormalform(true, true));
if (entry != null) {
oldEtag = entry.getETag();
@ -380,7 +381,7 @@ public class RobotsTxt {
redirectionUrlString = redirectionUrlString.trim();
// generating the new URL object
final DigestURI redirectionUrl = DigestURI.newURL(robotsURL, redirectionUrlString);
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(robotsURL, redirectionUrlString));
// following the redirection
if (log.isFinest()) log.logFinest("Redirection detected for robots.txt with URL '" + robotsURL + "'." +

View File

@ -32,6 +32,7 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -272,8 +273,8 @@ public class FTPLoader {
* @param entryUrl
* @return
*/
private String getPath(final DigestURI entryUrl) {
return DigestURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
private String getPath(final MultiProtocolURI entryUrl) {
return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
}
}

View File

@ -0,0 +1,144 @@
/**
* FileLoader
* Copyright 2010 by Michael Peter Christen
* First released 25.5.2010 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
import de.anomic.net.ftpc;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.data.MimeTable;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
public class FileLoader {
private final Switchboard sb;
private final Log log;
private final int maxFileSize;
public FileLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
}
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
DigestURI url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) {
DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
}
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
if (url.isDirectory()) {
String[] l = url.list();
if (l == null) {
// this can only happen if there is no connection or the directory does not exist
log.logInfo("directory listing not available. URL = " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString());
throw new IOException("directory listing not available. URL = " + request.url().toString());
}
String u = url.toNormalform(true, true);
List<String> list = new ArrayList<String>();
for (String s: l) {
list.add(u + ((u.endsWith("/") || u.endsWith("\\")) ? "" : "/") + s);
}
StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
content.toString().getBytes());
return response;
}
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size = url.length();
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
if (parserError != null) {
log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
} else {
log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
}
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
url.toNormalform(true, true).getBytes());
return response;
}
// load the resource
InputStream is = url.getInputStream();
byte[] b = FileUtils.read(is);
is.close();
// create response with loaded content
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
b);
return response;
}
}

View File

@ -27,6 +27,7 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -180,7 +181,7 @@ public final class HTTPLoader {
}
// normalizing URL
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
@ -289,7 +290,7 @@ public final class HTTPLoader {
}
// normalizing URL
final DigestURI redirectionUrl = DigestURI.newURL(request.url(), redirectionUrlString);
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// if we are already doing a shutdown we don't need to retry crawling

View File

@ -52,6 +52,7 @@ import org.xml.sax.SAXException;
import de.anomic.data.bookmarksDB.Bookmark;
import de.anomic.data.bookmarksDB.Tag;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@ -128,9 +129,9 @@ public class BookmarkHelper {
int importCount = 0;
Map<DigestURI, String> links = new HashMap<DigestURI, String>();
Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
String title;
DigestURI url;
MultiProtocolURI url;
Bookmark bm;
final Set<String> tags=listManager.string2set(tag); //this allow multiple default tags
try {
@ -142,14 +143,14 @@ public class BookmarkHelper {
writer.close();
links = scraper.getAnchors();
} catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
for (Entry<DigestURI, String> link: links.entrySet()) {
url= link.getKey();
title=link.getValue();
for (Entry<MultiProtocolURI, String> link: links.entrySet()) {
url = link.getKey();
title = link.getValue();
Log.logInfo("BOOKMARKS", "links.get(url)");
if(title.equals("")){//cannot be displayed
title=url.toString();
if (title.equals("")) {//cannot be displayed
title = url.toString();
}
bm=db.new Bookmark(url.toString());
bm = db.new Bookmark(url.toString());
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
bm.setTags(tags);
bm.setPublic(importPublic);

View File

@ -5,7 +5,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.util.Properties;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
public class MimeTable {
@ -42,11 +42,11 @@ public class MimeTable {
return mimeTable.getProperty(ext, dfltMime);
}
public static String url2mime(final DigestURI url, final String dfltMime) {
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
return ext2mime(url.getFileExtension(), dfltMime);
}
public static String url2mime(final DigestURI url) {
public static String url2mime(final MultiProtocolURI url) {
return ext2mime(url.getFileExtension());
}
}

View File

@ -2645,7 +2645,7 @@ public class ftpc {
page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n");
page.append("<html><head>\n");
page.append(" <title>" + title + "</title>\n");
page.append(" <meta name=\"generator\" content=\"YaCy ftpc dirlisting\">\n");
page.append(" <meta name=\"generator\" content=\"YaCy directory listing\">\n");
if (metaRobotNoindex) {
page.append(" <meta name=\"robots\" content=\"noindex\">\n");
}
@ -2674,7 +2674,7 @@ public class ftpc {
if (line.length() > nameEnd) {
page.append(line.substring(nameEnd));
}
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://")) {
} else if (line.startsWith("http://") || line.startsWith("ftp://") || line.startsWith("smb://") || line.startsWith("file://")) {
page.append("<a href=\"" + line + "\">" + line + "</a>");
} else {
// raw

View File

@ -146,7 +146,7 @@ public class DocumentIndex extends Segment {
* If the given file is a path to a directory, the complete sub-tree is indexed
* @param start
*/
public void addConcurrent(DigestURI start) {
public void addConcurrent(DigestURI start) throws IOException {
assert (start != null);
assert (start.canRead()) : start.toString();
if (!start.isDirectory()) {

View File

@ -32,6 +32,7 @@ import java.util.TreeSet;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
@ -130,25 +131,25 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
public static ArrayList<MediaSnippet> computeMediaSnippets(final Document document, final HandleSet queryhashes, final ContentDomain mediatype) {
if (document == null) return new ArrayList<MediaSnippet>();
Map<DigestURI, String> media = null;
Map<MultiProtocolURI, String> media = null;
if (mediatype == ContentDomain.AUDIO) media = document.getAudiolinks();
else if (mediatype == ContentDomain.VIDEO) media = document.getVideolinks();
else if (mediatype == ContentDomain.APP) media = document.getApplinks();
if (media == null) return null;
final Iterator<Map.Entry<DigestURI, String>> i = media.entrySet().iterator();
Map.Entry<DigestURI, String> entry;
final Iterator<Map.Entry<MultiProtocolURI, String>> i = media.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
DigestURI url;
String desc;
final ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
url = new DigestURI(entry.getKey());
desc = entry.getValue();
int ranking = TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, document.dc_source()));
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, new DigestURI(document.dc_source())));
}
}
return result;
@ -167,7 +168,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final ArrayList<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) {
ientry = i.next();
url = ientry.url();
url = new DigestURI(ientry.url());
String u = url.toString();
if (u.indexOf(".ico") >= 0 || u.indexOf("favicon") >= 0) continue;
if (ientry.height() > 0 && ientry.height() < 64) continue;
@ -177,7 +178,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
TextSnippet.removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
TextSnippet.removeAppearanceHashes(desc, queryhashes).size();
final int ranking = Integer.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, document.dc_source()));
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, new DigestURI(document.dc_source())));
}
return result;
}

View File

@ -38,6 +38,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -516,7 +517,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
if (format == 2) {
pw.println("<item>");
pw.println("<title>" + CharacterCoding.unicode2xml(metadata.dc_title(), true) + "</title>");
pw.println("<link>" + DigestURI.escape(url) + "</link>");
pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
if (metadata.dc_creator().length() > 0) pw.println("<author>" + CharacterCoding.unicode2xml(metadata.dc_creator(), true) + "</author>");
if (metadata.dc_subject().length() > 0) pw.println("<description>" + CharacterCoding.unicode2xml(metadata.dc_subject(), true) + "</description>");
pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");

View File

@ -39,6 +39,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -631,7 +632,7 @@ public final class RankingProcess extends Thread {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = DigestURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
// add references
//addTopic(urlcomps);

View File

@ -31,6 +31,7 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -124,7 +125,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
}
public String urlname() {
return (alternative_urlname == null) ? DigestURI.unescape(urlcomps.url().toNormalform(false, true)) : alternative_urlname;
return (alternative_urlname == null) ? MultiProtocolURI.unescape(urlcomps.url().toNormalform(false, true)) : alternative_urlname;
}
public String title() {
return urlcomps.dc_title();

View File

@ -30,8 +30,8 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
@ -370,8 +370,8 @@ public class ResultFetcher {
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true, true);
final String[] urlcomps = DigestURI.urlComps(urlstring);
final String[] descrcomps = DigestURI.splitpattern.split(rentry.title().toLowerCase());
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
Navigator.Item tc;
for (int j = 0; j < urlcomps.length; j++) {
tc = topwords.get(urlcomps[j]);

View File

@ -37,6 +37,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
@ -198,7 +199,7 @@ public class Segment {
private int addPageIndex(final DigestURI url, final Date urlModified, final Document document, final Condenser condenser, final String language, final char doctype, final int outlinksSame, final int outlinksOther) {
int wordCount = 0;
final int urlLength = url.toNormalform(true, true).length();
final int urlComps = DigestURI.urlComps(url.toString()).length;
final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;
// iterate over all words of context text
final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
@ -273,10 +274,10 @@ public class Segment {
if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) {
// no confirmation using the url, use the TLD
language = url.language();
System.out.println(error + ", corrected using the TLD");
log.logWarning(error + ", corrected using the TLD");
} else {
// this is a strong hint that the statistics was in fact correct
System.out.println(error + ", but the url proves that the statistic is correct");
log.logWarning(error + ", but the url proves that the statistic is correct");
}
}
} else {

View File

@ -70,16 +70,17 @@ import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadataRow.Components;
@ -291,7 +292,7 @@ public final class Switchboard extends serverSwitch {
// init sessionid name file
final String sessionidNamesFile = getConfig("sessionidNamesFile","");
this.log.logConfig("Loading sessionid file " + sessionidNamesFile);
DigestURI.initSessionIDNames(new File(getRootPath(), sessionidNamesFile));
MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File(getRootPath(), sessionidNamesFile)));
// init tables
this.tables = new WorkTables(this.workPath);
@ -1733,7 +1734,7 @@ public final class Switchboard extends serverSwitch {
((response.profile() == null) || (response.depth() < response.profile().depth()))
) {
// get the hyperlinks
final Map<DigestURI, String> hl = document.getHyperlinks();
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
// add all images also to the crawl stack
for (ImageEntry imageReference : document.getImages().values()) {
@ -1741,15 +1742,15 @@ public final class Switchboard extends serverSwitch {
}
// insert those hyperlinks to the crawler
DigestURI nextUrl;
for (Map.Entry<DigestURI, String> nextEntry : hl.entrySet()) {
MultiProtocolURI nextUrl;
for (Map.Entry<MultiProtocolURI, String> nextEntry : hl.entrySet()) {
// check for interruption
checkInterruption();
// process the next hyperlink
nextUrl = nextEntry.getKey();
String u = nextUrl.toNormalform(true, true, true);
if (!(u.startsWith("http") || u.startsWith("ftp") || u.startsWith("smb"))) continue;
if (!(u.startsWith("http://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://"))) continue;
// enqueue the hyperlink into the pre-notice-url db
try {
crawlStacker.enqueueEntry(new Request(

View File

@ -405,7 +405,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
final DigestURI resFavicon = document.getFavicon();
final DigestURI resFavicon = (document.getFavicon() == null) ? null : new DigestURI(document.getFavicon());
if (resFavicon != null) faviconCache.put(new String(url.hash()), resFavicon);
// we have found a parseable non-empty file: use the lines

View File

@ -52,8 +52,8 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.Formatter;
@ -369,9 +369,9 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
if (this.size() == 0) return "";
StringBuilder param = new StringBuilder();
for (Map.Entry<String, String> entry: this.entrySet()) {
param.append(DigestURI.escape(entry.getKey()));
param.append(MultiProtocolURI.escape(entry.getKey()));
param.append('=');
param.append(DigestURI.escape(entry.getValue()));
param.append(MultiProtocolURI.escape(entry.getValue()));
param.append('&');
}
param.setLength(param.length() - 1);

View File

@ -37,6 +37,7 @@ import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
@ -95,11 +96,11 @@ public class WebStructureGraph {
}
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final Document document, final Condenser condenser, final Date docDate) {
final DigestURI url = document.dc_source();
final DigestURI url = new DigestURI(document.dc_source());
// generate citation reference
final Map<DigestURI, String> hl = document.getHyperlinks();
final Iterator<DigestURI> it = hl.keySet().iterator();
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
byte[] nexturlhashb;
String nexturlhash;
final StringBuilder cpg = new StringBuilder(12 * (hl.size() + 1) + 1);
@ -109,7 +110,7 @@ public class WebStructureGraph {
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
nexturlhashb = it.next().hash();
nexturlhashb = new DigestURI(it.next()).hash();
if (nexturlhashb != null) {
nexturlhash = new String(nexturlhashb);
assert nexturlhash.length() == 12 : "nexturlhash.length() = " + nexturlhash.length() + ", nexturlhash = " + nexturlhash;

View File

@ -54,14 +54,12 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Pattern;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.HttpConnector;
import net.yacy.cora.services.Search;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -86,10 +84,8 @@ import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.client.DefaultCharsetFilePart;
import de.anomic.http.client.DefaultCharsetStringPart;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
import de.anomic.search.RankingProfile;
import de.anomic.search.RankingProcess;
import de.anomic.search.Segment;
@ -101,6 +97,22 @@ import de.anomic.tools.crypt;
public final class yacyClient {
/**
* @see wput
* @param target
* @param filename
* @param post
* @return
* @throws IOException
*/
private static byte[] postToFile(final yacySeed target, final String filename, final List<Part> post, final int timeout) throws IOException {
return HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false);
}
private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List<Part> post, final int timeout) throws IOException {
return HttpConnector.wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false);
}
/**
* this is called to enrich the seed information by
* - own address (if peer is behind a nat/router)
@ -134,7 +146,7 @@ public final class yacyClient {
post.add(new DefaultCharsetStringPart("seed", mySeed.genSeedStr(salt)));
// send request
final long start = System.currentTimeMillis();
final byte[] content = wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 30000, false);
yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds");
result = FileUtils.table(content);
break;
@ -237,82 +249,6 @@ public final class yacyClient {
return count;
}
/**
* send data to the server named by vhost
*
* @param address address of the server
* @param vhost name of the server at address which should respond
* @param post data to send (name-value-pairs)
* @param gzipBody send with content gzip encoded
* @return response body
* @throws IOException
*/
/*
private static byte[] wput(final String url, String vhost, final List<Part> post, boolean gzipBody) throws IOException {
return wput(url, vhost, post, 10000, gzipBody);
}
*/
/**
* send data to the server named by vhost
*
* @param address address of the server
* @param vhost name of the server at address which should respond
* @param post data to send (name-value-pairs)
* @param timeout in milliseconds
* @return response body
* @throws IOException
*/
private static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout) throws IOException {
return wput(url, vhost, post, timeout, false);
}
/**
* send data to the server named by vhost
*
* @param address address of the server
* @param vhost name of the server at address which should respond
* @param post data to send (name-value-pairs)
* @param timeout in milliseconds
* @param gzipBody send with content gzip encoded
* @return response body
* @throws IOException
*/
private static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout, final boolean gzipBody) throws IOException {
final RequestHeader header = new RequestHeader();
header.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
header.put(HeaderFramework.HOST, vhost);
final Client client = new Client(timeout, header);
client.setProxy(proxyConfig());
ResponseContainer res = null;
byte[] content = null;
try {
// send request/data
res = client.POST(url, post, gzipBody);
content = res.getData();
} finally {
if(res != null) {
// release connection
res.closeStream();
}
}
return content;
}
/**
* @see wput
* @param target
* @param filename
* @param post
* @return
* @throws IOException
*/
private static byte[] postToFile(final yacySeed target, final String filename, final List<Part> post, final int timeout) throws IOException {
return wput("http://" + target.getClusterAddress() + "/yacy/" + filename, target.getHexHash() + ".yacyh", post, timeout, false);
}
private static byte[] postToFile(final yacySeedDB seedDB, final String targetHash, final String filename, final List<Part> post, final int timeout) throws IOException {
return wput("http://" + targetAddress(seedDB, targetHash) + "/yacy/" + filename, yacySeed.b64Hash2hexHash(targetHash)+ ".yacyh", post, timeout, false);
}
public static yacySeed querySeed(final yacySeed target, final String seedHash) {
// prepare request
final String salt = crypt.randomSalt();
@ -400,7 +336,7 @@ public final class yacyClient {
// send request
try {
/* a long time-out is needed */
final byte[] result = wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime);
final byte[] result = HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/urls.xml", target.getHexHash() + ".yacyh", post, (int) maxTime);
final RSSReader reader = RSSReader.parse(result);
if (reader == null) {
yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null");
@ -425,120 +361,11 @@ public final class yacyClient {
return null;
}
}
public static BlockingQueue<RSSMessage> search(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords) {
if (urlBase == null) {
urlBase = "http://localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss";
}
BlockingQueue<RSSMessage> queue = new LinkedBlockingQueue<RSSMessage>();
searchJob job = new searchJob(urlBase, query, verify, global, timeout, maximumRecords, queue);
job.start();
return queue;
}
private final static int recordsPerSession = 10;
public static class searchJob extends Thread {
String urlBase, query;
boolean verify, global;
long timeout;
int startRecord, maximumRecords;
BlockingQueue<RSSMessage> queue;
public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue<RSSMessage> queue) {
this.urlBase = urlBase;
this.query = query;
this.verify = verify;
this.global = global;
this.timeout = timeout;
this.startRecord = 0;
this.maximumRecords = maximumRecords;
this.queue = queue;
}
public void run() {
RSSMessage message;
mainloop: while (timeout > 0 && maximumRecords > 0) {
long st = System.currentTimeMillis();
RSSFeed feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession);
if (feed == null || feed.isEmpty()) break mainloop;
maximumRecords -= feed.size();
innerloop: while (!feed.isEmpty()) {
message = feed.pollMessage();
if (message == null) break innerloop;
try {
queue.put(message);
} catch (InterruptedException e) {
break innerloop;
}
}
startRecord += recordsPerSession;
timeout -= System.currentTimeMillis() - st;
}
try { queue.put(RSSMessage.POISON); } catch (InterruptedException e) {}
}
}
/**
* send a query to a yacy public search interface
* @param urlBase the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
* @param query the query as string
* @param startRecord number of first record
* @param maximumRecords maximum number of records
* @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
* @param global if true also search results from other peers are included
* @param timeout milliseconds that are waited at maximum for a search result
* @return
*/
public static RSSFeed search(String urlBase, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) {
// returns a search result from a peer
if (urlBase == null) {
urlBase = "http://localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") + "/yacysearch.rss";
}
DigestURI uri = null;
try {
uri = new DigestURI(urlBase, null);
} catch (MalformedURLException e) {
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + urlBase + "': bad url, " + e.getMessage());
return null;
}
// prepare request
final List<Part> post = new ArrayList<Part>();
post.add(new DefaultCharsetStringPart("query", query));
post.add(new DefaultCharsetStringPart("startRecord", Integer.toString(startRecord)));
post.add(new DefaultCharsetStringPart("maximumRecords", Long.toString(maximumRecords)));
post.add(new DefaultCharsetStringPart("verify", verify ? "true" : "false"));
post.add(new DefaultCharsetStringPart("resource", global ? "global" : "local"));
// send request
try {
final byte[] result = wput(urlBase, uri.getHost(), post, (int) timeout);
//String debug = new String(result); System.out.println("*** DEBUG: " + debug);
final RSSReader reader = RSSReader.parse(result);
if (reader == null) {
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
return null;
}
final RSSFeed feed = reader.getFeed();
if (feed == null) {
// case where the rss reader does not understand the content
yacyCore.log.logWarning("yacyClient.search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
return null;
}
return feed;
} catch (final IOException e) {
yacyCore.log.logSevere("yacyClient.search error asking peer '" + uri.getHost() + "':" + e.toString());
return null;
}
}
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) {
public static RSSFeed search(final yacySeed targetSeed, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
String address = (targetSeed == null || targetSeed == Switchboard.getSwitchboard().peers.mySeed()) ? "localhost:" + Switchboard.getSwitchboard().getConfig("port", "8080") : targetSeed.getClusterAddress();
String urlBase = "http://" + address + "/yacysearch.rss";
return search(urlBase, query, verify, global, timeout, startRecord, maximumRecords);
return Search.search(urlBase, query, verify, global, timeout, startRecord, maximumRecords);
}
@SuppressWarnings("unchecked")
@ -607,7 +434,7 @@ public final class yacyClient {
// send request
HashMap<String, String> result = null;
try {
result = FileUtils.table(wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000));
result = FileUtils.table(HttpConnector.wput("http://" + target.getClusterAddress() + "/yacy/search.html", target.getHexHash() + ".yacyh", post, 60000));
} catch (final IOException e) {
yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
//yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
@ -878,7 +705,7 @@ public final class yacyClient {
// send request
try {
final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000);
final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 10000);
final HashMap<String, String> result = FileUtils.table(content);
return result;
} catch (final Exception e) {
@ -902,7 +729,7 @@ public final class yacyClient {
// send request
try {
final byte[] content = wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000);
final byte[] content = HttpConnector.wput("http://" + targetAddress + "/yacy/transfer.html", targetAddress, post, 20000);
final HashMap<String, String> result = FileUtils.table(content);
return result;
} catch (final Exception e) {
@ -977,7 +804,7 @@ public final class yacyClient {
// send request
try {
final byte[] content = wput("http://" + address + "/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/crawlReceipt.html", target.getHexHash() + ".yacyh", post, 10000);
return FileUtils.table(content);
} catch (final Exception e) {
// most probably a network time-out exception
@ -1127,7 +954,7 @@ public final class yacyClient {
post.add(new DefaultCharsetStringPart("entryc", Integer.toString(indexcount)));
post.add(new DefaultCharsetStringPart("indexes", entrypost.toString()));
try {
final byte[] content = wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferRWI.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
final Iterator<String> v = FileUtils.strings(content);
// this should return a list of urlhashes that are unknown
@ -1171,7 +998,7 @@ public final class yacyClient {
}
post.add(new DefaultCharsetStringPart("urlc", Integer.toString(urlc)));
try {
final byte[] content = wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/transferURL.html", targetSeed.getHexHash() + ".yacyh", post, timeout, gzipBody);
final Iterator<String> v = FileUtils.strings(content);
final HashMap<String, String> result = FileUtils.table(v);
@ -1193,7 +1020,7 @@ public final class yacyClient {
String address = targetSeed.getClusterAddress();
if (address == null) { address = "localhost:8080"; }
try {
final byte[] content = wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/profile.html", targetSeed.getHexHash() + ".yacyh", post, 5000);
return FileUtils.table(content);
} catch (final Exception e) {
yacyCore.log.logSevere("yacyClient.getProfile error:" + e.getMessage());
@ -1201,14 +1028,6 @@ public final class yacyClient {
}
}
/**
* proxy for "to YaCy connections"
* @return
*/
private static final RemoteProxyConfig proxyConfig() {
final RemoteProxyConfig p = RemoteProxyConfig.getRemoteProxyConfig();
return ((p != null) && (p.useProxy()) && (p.useProxy4Yacy())) ? p : null;
}
public static void main(final String[] args) {
if(args.length > 1) {
@ -1262,7 +1081,7 @@ public final class yacyClient {
//post.add(new FilePart("filename", new ByteArrayPartSource(filename, file)));
// do it!
try {
final byte[] response = wput(url.toString(), vhost, post, timeout, gzipBody);
final byte[] response = HttpConnector.wput(url.toString(), vhost, post, timeout, gzipBody);
System.out.println(new String(response));
} catch (final IOException e) {
Log.logException(e);

View File

@ -48,8 +48,8 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;

View File

@ -26,8 +26,8 @@ package de.anomic.yacy;
import java.util.HashMap;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.MapTools;

View File

@ -45,8 +45,8 @@ import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -74,17 +74,17 @@ public final class yacyRelease extends yacyVersion {
private static Map<yacyUpdateLocation, DevAndMainVersions> latestReleases = new HashMap<yacyUpdateLocation, DevAndMainVersions>();
public final static List<yacyUpdateLocation> latestReleaseLocations = new ArrayList<yacyUpdateLocation>(); // will be initialized with value in defaults/yacy.network.freeworld.unit
private DigestURI url;
private MultiProtocolURI url;
private File releaseFile;
private PublicKey publicKey;
public yacyRelease(final DigestURI url) {
public yacyRelease(final MultiProtocolURI url) {
super(url.getFileName());
this.url = url;
}
public yacyRelease(final DigestURI url, PublicKey publicKey) {
public yacyRelease(final MultiProtocolURI url, PublicKey publicKey) {
this(url);
this.publicKey = publicKey;
}
@ -94,7 +94,7 @@ public final class yacyRelease extends yacyVersion {
this.releaseFile = releaseFile;
}
public DigestURI getUrl() {
public MultiProtocolURI getUrl() {
return url;
}
@ -241,10 +241,10 @@ public final class yacyRelease extends yacyVersion {
}
// analyse links in scraper resource, and find link to latest release in it
final Map<DigestURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final Map<MultiProtocolURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
for(DigestURI url : anchors.keySet()) {
for (MultiProtocolURI url : anchors.keySet()) {
try {
yacyRelease release = new yacyRelease(url, location.getPublicKey());
//System.out.println("r " + release.toAnchor());

View File

@ -0,0 +1,42 @@
/**
* Channel
* Copyright 2010 by Michael Peter Christen
* First released 10.5.2010 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
public interface Channel extends Iterable<Hit> {
public void setTitle(String title);
public void setLink(String link);
public void setDescription(String description);
public void setImageURL(String imageUrl);
public void setTotalResults(String totalResults);
public void setStartIndex(String startIndex);
public void setItemsPerPage(String itemsPerPage);
public void setSearchTerms(String searchTerms);
}

View File

@ -0,0 +1,27 @@
/**
* Channels
* Copyright 2010 by Michael Peter Christen
* First released 10.5.2010 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
public class Channels {
}

View File

@ -0,0 +1,74 @@
/**
* Hit
* Copyright 2010 by Michael Peter Christen
* First released 10.5.2010 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document;
/**
 * A Hit is a single item of a search result or RSS/OpenSearch feed.
 * The setters and getters mirror each other field by field; note that a
 * {@code getCreator} counterpart for {@link #setCreator(String)} is
 * intentionally absent from this interface surface.
 */
public interface Hit {

    /** @param author the author of the item */
    public void setAuthor(String author);

    /** @param copyright the copyright statement of the item */
    public void setCopyright(String copyright);

    /** @param category the category the item is assigned to */
    public void setCategory(String category);

    /** @param title the title of the item */
    public void setTitle(String title);

    /** @param link the link (url) of the item */
    public void setLink(String link);

    /** @param referrer the referrer of the item */
    public void setReferrer(String referrer);

    /** @param language the language of the item */
    public void setLanguage(String language);

    /** @param description the description (abstract) of the item */
    public void setDescription(String description);

    /** @param creator the creator of the item */
    public void setCreator(String creator);

    /** @param pubdate the publication date of the item */
    public void setPubDate(String pubdate);

    /** @param guid the globally unique identifier of the item */
    public void setGuid(String guid);

    /** @param docs the docs field of the item */
    public void setDocs(String docs);

    /** @return the author of the item */
    public String getAuthor();

    /** @return the copyright statement of the item */
    public String getCopyright();

    /** @return the category the item is assigned to */
    public String getCategory();

    /** @return the title of the item */
    public String getTitle();

    /** @return the link (url) of the item */
    public String getLink();

    /** @return the referrer of the item */
    public String getReferrer();

    /** @return the language of the item */
    public String getLanguage();

    /** @return the description (abstract) of the item */
    public String getDescription();

    /** @return the publication date of the item */
    public String getPubDate();

    /** @return the globally unique identifier of the item */
    public String getGuid();

    /** @return the docs field of the item */
    public String getDocs();
}

File diff suppressed because it is too large Load Diff

View File

@ -21,19 +21,19 @@
* USA
*/
package net.yacy.kelondro.util;
package net.yacy.cora.document;
public class Punycode {
/* Punycode parameters */
final static int TMIN = 1;
final static int TMAX = 26;
final static int BASE = 36;
final static int INITIAL_N = 128;
final static int INITIAL_BIAS = 72;
final static int DAMP = 700;
final static int SKEW = 38;
final static char DELIMITER = '-';
private final static int TMIN = 1;
private final static int TMAX = 26;
private final static int BASE = 36;
private final static int INITIAL_N = 128;
private final static int INITIAL_BIAS = 72;
private final static int DAMP = 700;
private final static int SKEW = 38;
private final static char DELIMITER = '-';
/**
* Punycodes a unicode string.

View File

@ -1,40 +1,31 @@
// RSSFeed.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* RSSFeed
* Copyright 2007 by Michael Peter Christen
* First released 16.7.2007 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.xml;
package net.yacy.cora.document;
import java.util.HashSet;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.document.content.RSSMessage;
public class RSSFeed implements Iterable<RSSMessage> {
public class RSSFeed implements Iterable<Hit> {
// static channel names of feeds
public static final String TEST = "TEST";
@ -119,7 +110,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
return messages.size();
}
public Iterator<RSSMessage> iterator() {
public Iterator<Hit> iterator() {
return new messageIterator();
}
@ -131,7 +122,7 @@ public class RSSFeed implements Iterable<RSSMessage> {
return messages.remove(nextGUID);
}
public class messageIterator implements Iterator<RSSMessage>{
public class messageIterator implements Iterator<Hit>{
Iterator<String> GUIDiterator;
String lastGUID;

View File

@ -1,31 +1,24 @@
// RSSMessage.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 16.07.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* RSSMessage
* Copyright 2007 by Michael Peter Christen
* First released 16.7.2007 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.content;
package net.yacy.cora.document;
import java.util.Date;
import java.util.HashMap;
@ -33,7 +26,7 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
public class RSSMessage {
public class RSSMessage implements Hit {
// statics for item generation and automatic categorization
private static int guidcount = 0;
@ -165,4 +158,74 @@ public class RSSMessage {
public String toString() {
return this.map.toString();
}
public void setAuthor(String title) {
// TODO Auto-generated method stub
}
public void setCategory(String title) {
// TODO Auto-generated method stub
}
public void setCopyright(String title) {
// TODO Auto-generated method stub
}
public void setCreator(String pubdate) {
// TODO Auto-generated method stub
}
public void setDescription(String description) {
// TODO Auto-generated method stub
}
public void setDocs(String guid) {
// TODO Auto-generated method stub
}
public void setGuid(String guid) {
// TODO Auto-generated method stub
}
public void setLanguage(String title) {
// TODO Auto-generated method stub
}
public void setLink(String link) {
// TODO Auto-generated method stub
}
public void setPubDate(String pubdate) {
// TODO Auto-generated method stub
}
public void setReferrer(String title) {
// TODO Auto-generated method stub
}
public void setSize(long size) {
// TODO Auto-generated method stub
}
public void setSizename(String sizename) {
// TODO Auto-generated method stub
}
public void setTitle(String title) {
// TODO Auto-generated method stub
}
}

View File

@ -1,30 +1,24 @@
// RSSReader.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 16.07.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* RSSReader
* Copyright 2007 by Michael Peter Christen
* First released 16.7.2007 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.xml;
package net.yacy.cora.document;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -34,10 +28,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.document.content.RSSMessage;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@ -86,25 +76,21 @@ public class RSSReader extends DefaultHandler {
}
}
public static RSSReader parse(final byte[] a) {
public static RSSReader parse(final byte[] a) throws IOException {
// check integrity of array
if ((a == null) || (a.length == 0)) {
Log.logWarning("rssReader", "response=null");
return null;
throw new IOException("response=null");
}
if (a.length < 100) {
Log.logWarning("rssReader", "response=" + new String(a));
return null;
throw new IOException("response=" + new String(a));
}
if (!ByteBuffer.equals(a, "<?xml".getBytes())) {
Log.logWarning("rssReader", "response does not contain valid xml");
return null;
if (!equals(a, "<?xml".getBytes())) {
throw new IOException("response does not contain valid xml");
}
final String end = new String(a, a.length - 10, 10);
if (end.indexOf("rss") < 0) {
Log.logWarning("rssReader", "response incomplete");
return null;
throw new IOException("response incomplete");
}
// make input stream
@ -115,13 +101,18 @@ public class RSSReader extends DefaultHandler {
try {
reader = new RSSReader(bais);
} catch (final Exception e) {
Log.logException(e);
Log.logWarning("rssReader", "parse exception: " + e.getMessage(), e);
return null;
throw new IOException("parse exception: " + e.getMessage(), e);
}
try { bais.close(); } catch (final IOException e) {}
return reader;
}
private final static boolean equals(final byte[] buffer, final byte[] pattern) {
// compares two byte arrays: true, if pattern appears completely at offset position
if (buffer.length < pattern.length) return false;
for (int i = 0; i < pattern.length; i++) if (buffer[i] != pattern[i]) return false;
return true;
}
@Override
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {

View File

@ -0,0 +1,90 @@
/**
* HttpConnector
* Copyright 2010 by Michael Peter Christen
* First released 25.05.2010 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.protocol;
import java.io.IOException;
import java.util.List;
import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
public class HttpConnector {

    /**
     * Send a POST request to the server named by vhost and return the
     * response body.
     *
     * @param url address of the server
     * @param vhost name of the server at address which should respond
     * @param post data to send (name-value-pairs)
     * @param timeout in milliseconds
     * @return response body
     * @throws IOException if the request cannot be completed
     */
    public static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout) throws IOException {
        return wput(url, vhost, post, timeout, false);
    }

    /**
     * Send a POST request to the server named by vhost and return the
     * response body, optionally gzip-encoding the request body.
     *
     * @param url address of the server
     * @param vhost name of the server at address which should respond
     * @param post data to send (name-value-pairs)
     * @param timeout in milliseconds
     * @param gzipBody send with content gzip encoded
     * @return response body
     * @throws IOException if the request cannot be completed
     */
    public static byte[] wput(final String url, final String vhost, final List<Part> post, final int timeout, final boolean gzipBody) throws IOException {
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent);
        requestHeader.put(HeaderFramework.HOST, vhost);

        final Client httpClient = new Client(timeout, requestHeader);
        httpClient.setProxy(proxyConfig());

        ResponseContainer response = null;
        byte[] body = null;
        try {
            // send request/data and read the answer
            response = httpClient.POST(url, post, gzipBody);
            body = response.getData();
        } finally {
            // always release the connection, even when POST/getData failed
            if (response != null) {
                response.closeStream();
            }
        }
        return body;
    }

    /**
     * @return the remote proxy configuration if a proxy is configured and
     *         enabled for yacy traffic, otherwise null
     */
    private static final RemoteProxyConfig proxyConfig() {
        final RemoteProxyConfig config = RemoteProxyConfig.getRemoteProxyConfig();
        if (config == null) return null;
        return (config.useProxy() && config.useProxy4Yacy()) ? config : null;
    }
}

View File

@ -0,0 +1,145 @@
/**
* Search
* Copyright 2010 by Michael Peter Christen
* First released 25.05.2010 at http://yacy.net
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file COPYING.LESSER.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services;
import java.io.IOException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.HttpConnector;
import org.apache.commons.httpclient.methods.multipart.Part;
import org.apache.commons.httpclient.methods.multipart.StringPart;
public class Search {

    /** number of records requested from the remote peer per partial request */
    private final static int recordsPerSession = 10;

    /**
     * Start an asynchronous search on a yacy public search interface.
     * The results are written into the returned queue by a background
     * thread; the queue is terminated with RSSMessage.POISON when the
     * search has finished (or failed).
     *
     * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
     * @param query the query as string
     * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
     * @param global if true also search results from other peers are included
     * @param timeout milliseconds that are waited at maximum for a search result
     * @param maximumRecords maximum number of records that shall be retrieved in total
     * @return a queue filled with the results and terminated by RSSMessage.POISON
     */
    public static BlockingQueue<RSSMessage> search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int maximumRecords) {
        BlockingQueue<RSSMessage> queue = new LinkedBlockingQueue<RSSMessage>();
        searchJob job = new searchJob(rssSearchServiceURL, query, verify, global, timeout, maximumRecords, queue);
        job.start();
        return queue;
    }

    /**
     * Background thread that pages through the remote search results in
     * chunks of recordsPerSession until the requested number of records is
     * reached, the timeout budget is used up, or the peer has no more hits.
     */
    public static class searchJob extends Thread {

        String urlBase, query;
        boolean verify, global;
        long timeout;                 // remaining time budget in milliseconds
        int startRecord, maximumRecords;
        BlockingQueue<RSSMessage> queue;

        public searchJob(String urlBase, String query, boolean verify, boolean global, long timeout, int maximumRecords, BlockingQueue<RSSMessage> queue) {
            this.urlBase = urlBase;
            this.query = query;
            this.verify = verify;
            this.global = global;
            this.timeout = timeout;
            this.startRecord = 0;
            this.maximumRecords = maximumRecords;
            this.queue = queue;
        }

        public void run() {
            RSSMessage message;
            mainloop: while (timeout > 0 && maximumRecords > 0) {
                long st = System.currentTimeMillis();
                RSSFeed feed;
                try {
                    feed = search(urlBase, query, verify, global, timeout, startRecord, recordsPerSession);
                } catch (IOException e1) {
                    // peer not reachable or bad response: stop paging
                    break mainloop;
                }
                if (feed == null || feed.isEmpty()) break mainloop;
                maximumRecords -= feed.size();
                innerloop: while (!feed.isEmpty()) {
                    message = feed.pollMessage();
                    if (message == null) break innerloop;
                    try {
                        queue.put(message);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt(); // preserve interrupt status
                        break innerloop;
                    }
                }
                startRecord += recordsPerSession;
                timeout -= System.currentTimeMillis() - st; // consume the time budget
            }
            // always terminate the queue so consumers do not block forever
            try {
                queue.put(RSSMessage.POISON);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * send a query to a yacy public search interface
     * @param rssSearchServiceURL the target url base (everything before the ? that follows the SRU request syntax properties). can null, then the local peer is used
     * @param query the query as string
     * @param startRecord number of first record
     * @param maximumRecords maximum number of records
     * @param verify if true, result entries are verified using the snippet fetch (slow); if false simply the result is returned
     * @param global if true also search results from other peers are included
     * @param timeout milliseconds that are waited at maximum for a search result
     * @return the parsed feed of search results
     * @throws IOException if the url is malformed, the peer does not answer, or the response is not a parseable feed
     */
    public static RSSFeed search(String rssSearchServiceURL, String query, boolean verify, boolean global, long timeout, int startRecord, int maximumRecords) throws IOException {
        MultiProtocolURI uri = null;
        try {
            uri = new MultiProtocolURI(rssSearchServiceURL);
        } catch (MalformedURLException e) {
            throw new IOException("cora.Search failed asking peer '" + rssSearchServiceURL + "': bad url, " + e.getMessage(), e);
        }

        // prepare request
        final List<Part> post = new ArrayList<Part>();
        post.add(new StringPart("query", query, Charset.defaultCharset().name()));
        post.add(new StringPart("startRecord", Integer.toString(startRecord), Charset.defaultCharset().name()));
        post.add(new StringPart("maximumRecords", Integer.toString(maximumRecords), Charset.defaultCharset().name()));
        post.add(new StringPart("verify", verify ? "true" : "false", Charset.defaultCharset().name()));
        post.add(new StringPart("resource", global ? "global" : "local", Charset.defaultCharset().name()));

        // send request
        try {
            final byte[] result = HttpConnector.wput(rssSearchServiceURL, uri.getHost(), post, (int) timeout);
            final RSSReader reader = RSSReader.parse(result);
            if (reader == null) {
                // defensive guard; RSSReader.parse reports bad input by throwing
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (1), reader == null");
            }
            final RSSFeed feed = reader.getFeed();
            if (feed == null) {
                // case where the rss reader does not understand the content
                throw new IOException("cora.Search failed asking peer '" + uri.getHost() + "': probably bad response from remote peer (2)");
            }
            return feed;
        } catch (final IOException e) {
            // re-wrap with peer context but keep the original cause chain
            throw new IOException("cora.Search error asking peer '" + uri.getHost() + "':" + e.toString(), e);
        }
    }
}

View File

@ -33,7 +33,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.workflow.WorkflowThread;
@ -108,7 +108,7 @@ public abstract class AbstractParser implements Idiom {
return tempFile;
}
public int parseDir(final DigestURI location, final String prefix, final File dir, final Document doc)
public int parseDir(final MultiProtocolURI location, final String prefix, final File dir, final Document doc)
throws ParserException, InterruptedException, IOException {
if (!dir.isDirectory())
throw new ParserException("tried to parse ordinary file " + dir + " as directory", location);
@ -122,7 +122,7 @@ public abstract class AbstractParser implements Idiom {
if (file.isDirectory()) {
result += parseDir(location, prefix, file, doc);
} else try {
final DigestURI url = DigestURI.newURL(location, "/" + prefix + "/"
final MultiProtocolURI url = MultiProtocolURI.newURL(location, "/" + prefix + "/"
// XXX: workaround for relative paths within document
+ file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
+ "/" + file.getName());
@ -151,7 +151,7 @@ public abstract class AbstractParser implements Idiom {
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[])
*/
public Document parse(
final DigestURI location,
final MultiProtocolURI location,
final String mimeType,
final String charset,
final byte[] source
@ -186,7 +186,7 @@ public abstract class AbstractParser implements Idiom {
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
public Document parse(
final DigestURI location,
final MultiProtocolURI location,
final String mimeType,
final String charset,
final File sourceFile
@ -220,7 +220,7 @@ public abstract class AbstractParser implements Idiom {
*
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/
public abstract Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
public abstract Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
/**
* Return the name of the parser

View File

@ -46,10 +46,10 @@ import java.util.Properties;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.logging.Log;
@ -125,7 +125,7 @@ public final class Condenser {
this.languageIdentificator = new Identificator();
Map.Entry<DigestURI, String> entry;
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getText());
// the phrase counter:
@ -179,7 +179,7 @@ public final class Condenser {
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
// audio
Iterator<Map.Entry<DigestURI, String>> i = document.getAudiolinks().entrySet().iterator();
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);

View File

@ -45,9 +45,9 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
@ -55,7 +55,7 @@ import net.yacy.kelondro.util.FileUtils;
public class Document {
private final DigestURI source; // the source url
private final MultiProtocolURI source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
@ -65,24 +65,24 @@ public class Document {
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<DigestURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final HashMap<String, ImageEntry> images; // all visible pictures in document
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final HashMap<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<DigestURI, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<String, String> emaillinks;
private DigestURI favicon;
private MultiProtocolURI favicon;
private boolean resorted;
private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
private boolean indexingDenied;
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String publisher,
final String[] sections, final String abstrct,
final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images,
final Object text, final Map<MultiProtocolURI, String> anchors, final HashMap<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
@ -92,8 +92,8 @@ public class Document {
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.anchors = (anchors == null) ? new HashMap<DigestURI, String>(0) : anchors;
this.images = (images == null) ? new HashMap<String, ImageEntry>() : images;
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, String>(0) : anchors;
this.images = (images == null) ? new HashMap<MultiProtocolURI, ImageEntry>() : images;
this.publisher = publisher;
this.hyperlinks = null;
this.audiolinks = null;
@ -159,7 +159,7 @@ dc_rights
*/
public String dc_title() {
return title.toString();
return (title == null) ? "" : title.toString();
}
public void setTitle(String title) {
@ -167,9 +167,7 @@ dc_rights
}
public String dc_creator() {
if (creator == null)
return "";
return creator.toString();
return (creator == null) ? "" : creator.toString();
}
public String dc_subject(final char separator) {
@ -196,7 +194,7 @@ dc_rights
}
public String dc_publisher() {
return this.publisher;
return this.publisher == null ? "" : this.publisher;
}
public String dc_format() {
@ -207,7 +205,7 @@ dc_rights
return this.source.toNormalform(true, false);
}
public DigestURI dc_source() {
public MultiProtocolURI dc_source() {
return this.source;
}
@ -282,7 +280,7 @@ dc_rights
return this.keywords;
}
public Map<DigestURI, String> getAnchors() {
public Map<MultiProtocolURI, String> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(String)/text(String) map
return anchors;
@ -291,30 +289,30 @@ dc_rights
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<DigestURI, String> getHyperlinks() {
public Map<MultiProtocolURI, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!resorted) resortLinks();
return hyperlinks;
}
public Map<DigestURI, String> getAudiolinks() {
public Map<MultiProtocolURI, String> getAudiolinks() {
if (!resorted) resortLinks();
return this.audiolinks;
}
public Map<DigestURI, String> getVideolinks() {
public Map<MultiProtocolURI, String> getVideolinks() {
if (!resorted) resortLinks();
return this.videolinks;
}
public HashMap<String, ImageEntry> getImages() {
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return images;
}
public Map<DigestURI, String> getApplinks() {
public Map<MultiProtocolURI, String> getApplinks() {
if (!resorted) resortLinks();
return this.applinks;
}
@ -329,18 +327,18 @@ dc_rights
if (this.resorted) return;
// extract hyperlinks, medialinks and emaillinks from anchorlinks
DigestURI url;
MultiProtocolURI url;
String u;
int extpos, qpos;
String ext = null;
final Iterator<Map.Entry<DigestURI, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<DigestURI, String>();
videolinks = new HashMap<DigestURI, String>();
audiolinks = new HashMap<DigestURI, String>();
applinks = new HashMap<DigestURI, String>();
final Iterator<Map.Entry<MultiProtocolURI, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<MultiProtocolURI, String>();
videolinks = new HashMap<MultiProtocolURI, String>();
audiolinks = new HashMap<MultiProtocolURI, String>();
applinks = new HashMap<MultiProtocolURI, String>();
emaillinks = new HashMap<String, String>();
final HashMap<String, ImageEntry> collectedImages = new HashMap<String, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<DigestURI, String> entry;
final HashMap<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<MultiProtocolURI, String> entry;
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
@ -393,21 +391,21 @@ dc_rights
this.resorted = true;
}
public static Map<DigestURI, String> allSubpaths(final Collection<?> links) {
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries
final HashSet<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();
Object o;
DigestURI url;
MultiProtocolURI url;
String u;
int pos;
int l;
while (i.hasNext())
try {
o = i.next();
if (o instanceof DigestURI) url = (DigestURI) o;
else if (o instanceof String) url = new DigestURI((String) o, null);
if (o instanceof MultiProtocolURI) url = (MultiProtocolURI) o;
else if (o instanceof String) url = new MultiProtocolURI((String) o);
else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
else {
assert false;
@ -428,11 +426,11 @@ dc_rights
} catch (final MalformedURLException e) { }
// now convert the strings to yacyURLs
i = h.iterator();
final HashMap<DigestURI, String> v = new HashMap<DigestURI, String>();
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
while (i.hasNext()) {
u = (String) i.next();
try {
url = new DigestURI(u, null);
url = new MultiProtocolURI(u);
v.put(url, "sub");
} catch (final MalformedURLException e) {
}
@ -440,23 +438,23 @@ dc_rights
return v;
}
public static Map<DigestURI, String> allReflinks(final Collection<?> links) {
public static Map<MultiProtocolURI, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final HashMap<DigestURI, String> v = new HashMap<DigestURI, String>();
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Iterator<?> i = links.iterator();
Object o;
DigestURI url;
MultiProtocolURI url;
String u;
int pos;
loop: while (i.hasNext())
try {
o = i.next();
if (o instanceof DigestURI)
url = (DigestURI) o;
if (o instanceof MultiProtocolURI)
url = (MultiProtocolURI) o;
else if (o instanceof String)
url = new DigestURI((String) o, null);
url = new MultiProtocolURI((String) o);
else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
else {
@ -469,7 +467,7 @@ dc_rights
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
u = u.substring(pos);
url = new DigestURI(u, null);
url = new MultiProtocolURI(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -479,7 +477,7 @@ dc_rights
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
u = "http:/" + u.substring(pos);
url = new DigestURI(u, null);
url = new MultiProtocolURI(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
@ -512,14 +510,14 @@ dc_rights
/**
* @return the {@link URL} to the favicon that belongs to the document
*/
public DigestURI getFavicon() {
public MultiProtocolURI getFavicon() {
return this.favicon;
}
/**
* @param faviconURL the {@link URL} to the favicon that belongs to the document
*/
public void setFavicon(final DigestURI faviconURL) {
public void setFavicon(final MultiProtocolURI faviconURL) {
this.favicon = faviconURL;
}

View File

@ -29,7 +29,7 @@ import java.io.File;
import java.io.InputStream;
import java.util.Set;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
/**
@ -51,7 +51,7 @@ public interface Idiom {
*
* @throws ParserException if the content could not be parsed properly
*/
public Document parse(DigestURI location, String mimeType, String charset, byte[] source)
public Document parse(MultiProtocolURI location, String mimeType, String charset, byte[] source)
throws ParserException, InterruptedException;
/**
@ -65,7 +65,7 @@ public interface Idiom {
*
* @throws ParserException if the content could not be parsed properly
*/
public Document parse(DigestURI location, String mimeType, String charset, File sourceFile)
public Document parse(MultiProtocolURI location, String mimeType, String charset, File sourceFile)
throws ParserException, InterruptedException;
/**
@ -79,7 +79,7 @@ public interface Idiom {
*
* @throws ParserException if the content could not be parsed properly
*/
public Document parse(DigestURI location, String mimeType, String charset, InputStream source)
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source)
throws ParserException, InterruptedException;
/**

View File

@ -24,10 +24,10 @@
package net.yacy.document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
public class ParserException extends Exception {
private DigestURI url = null;
private MultiProtocolURI url = null;
private static final long serialVersionUID = 1L;
@ -35,12 +35,12 @@ public class ParserException extends Exception {
super();
}
public ParserException(final String message, final DigestURI url) {
public ParserException(final String message, final MultiProtocolURI url) {
super(message + "; url = " + url.toNormalform(true, false));
this.url = url;
}
public DigestURI getURL() {
public MultiProtocolURI getURL() {
return this.url;
}
}

View File

@ -40,6 +40,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser;
@ -61,7 +62,6 @@ import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -138,7 +138,7 @@ public final class TextParser {
}
public static Document parseSource(
final DigestURI location,
final MultiProtocolURI location,
final String mimeType,
final String charset,
final File sourceFile
@ -167,7 +167,7 @@ public final class TextParser {
}
public static Document parseSource(
final DigestURI location,
final MultiProtocolURI location,
String mimeType,
final String charset,
final byte[] content
@ -176,7 +176,7 @@ public final class TextParser {
}
public static Document parseSource(
final DigestURI location,
final MultiProtocolURI location,
String mimeType,
final String charset,
final long contentLength,
@ -211,7 +211,7 @@ public final class TextParser {
}
private static Document parseSource(
final DigestURI location,
final MultiProtocolURI location,
String mimeType,
Idiom idiom,
final String charset,
@ -233,7 +233,7 @@ public final class TextParser {
}
private static Document parseSource(
final DigestURI location,
final MultiProtocolURI location,
String mimeType,
List<Idiom> idioms,
final String charset,
@ -280,7 +280,7 @@ public final class TextParser {
* @param mimeType
* @return returns null if the content is supported. If the content is not supported, return a error string.
*/
public static String supports(final DigestURI url, String mimeType) {
public static String supports(final MultiProtocolURI url, String mimeType) {
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
List<Idiom> idioms = idiomParser(url, mimeType);
@ -304,7 +304,7 @@ public final class TextParser {
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws ParserException
*/
private static List<Idiom> idiomParser(final DigestURI url, String mimeType1) throws ParserException {
private static List<Idiom> idiomParser(final MultiProtocolURI url, String mimeType1) throws ParserException {
List<Idiom> idioms = new ArrayList<Idiom>(2);
// check extension
@ -345,7 +345,7 @@ public final class TextParser {
return null;
}
public static String supportsExtension(final DigestURI url) {
public static String supportsExtension(final MultiProtocolURI url) {
String ext = url.getFileExtension().toLowerCase();
if (ext == null || ext.length() == 0) return null;
if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
@ -357,7 +357,7 @@ public final class TextParser {
return null;
}
public static String mimeOf(DigestURI url) {
public static String mimeOf(MultiProtocolURI url) {
return mimeOf(url.getFileExtension());
}

View File

@ -33,12 +33,12 @@ import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import org.apache.tools.bzip2.CBZip2InputStream;
@ -75,7 +75,7 @@ public class bzipParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {

View File

@ -37,11 +37,11 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
/**
* a parser for comma-separated values
@ -73,7 +73,7 @@ public class csvParser extends AbstractParser implements Idiom {
}
@Override
public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
// construct a document using all cells of the document
// the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
@ -112,7 +112,7 @@ public class csvParser extends AbstractParser implements Idiom {
return sb.toString();
}
public List<String[]> getTable(DigestURI location, String mimeType, String charset, InputStream source) {
public List<String[]> getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) {
ArrayList<String[]> rows = new ArrayList<String[]>();
BufferedReader reader;
try {

View File

@ -32,11 +32,11 @@ import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor;
@ -65,7 +65,7 @@ public class docParser extends AbstractParser implements Idiom {
super("Word Document Parser");
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
final WordExtractor extractor;

View File

@ -34,12 +34,12 @@ import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
@ -74,7 +74,7 @@ public class gzipParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {

View File

@ -44,8 +44,8 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -79,8 +79,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// class variables: collectors for links
private HashMap<DigestURI, String> anchors;
private HashMap<String, ImageEntry> images; // urlhash/image relation
private HashMap<MultiProtocolURI, String> anchors;
private HashMap<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final HashMap<String, String> metas;
private String title;
//private String headline;
@ -89,23 +89,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final EventListenerList htmlFilterEventListeners;
/**
* {@link DigestURI} to the favicon that belongs to the document
* {@link MultiProtocolURI} to the favicon that belongs to the document
*/
private DigestURI favicon;
private MultiProtocolURI favicon;
/**
* The document root {@link DigestURI}
* The document root {@link MultiProtocolURI}
*/
private DigestURI root;
private MultiProtocolURI root;
@SuppressWarnings("unchecked")
public ContentScraper(final DigestURI root) {
public ContentScraper(final MultiProtocolURI root) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap<DigestURI, String>();
this.images = new HashMap<String, ImageEntry>();
this.anchors = new HashMap<MultiProtocolURI, String>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.metas = new HashMap<String, String>();
this.title = "";
this.headlines = new ArrayList[4];
@ -133,9 +133,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (b.length() != 0) content.append(b).append(32);
}
private DigestURI absolutePath(final String relativePath) {
private MultiProtocolURI absolutePath(final String relativePath) {
try {
return DigestURI.newURL(root, relativePath);
return MultiProtocolURI.newURL(root, relativePath);
} catch (final Exception e) {
return null;
}
@ -149,7 +149,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (width > 15 && height > 15) {
final float ratio = (float) Math.min(width, height) / Math.max(width, height);
if (ratio > 0.4) {
final DigestURI url = absolutePath(tagopts.getProperty("src", ""));
final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
}
@ -162,7 +162,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final NumberFormatException e) {}
}
if (tagname.equalsIgnoreCase("base")) try {
root = new DigestURI(tagopts.getProperty("href", ""), null);
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
@ -185,7 +185,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
}
if (tagname.equalsIgnoreCase("link")) {
final DigestURI newLink = absolutePath(tagopts.getProperty("href", ""));
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
if (newLink != null) {
final String type = tagopts.getProperty("rel", "");
@ -193,7 +193,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (type.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
images.put(new String(ie.url().hash()), ie);
images.put(ie.url(), ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink, linktitle);
@ -220,7 +220,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", "");
DigestURI url;
MultiProtocolURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFile();
final int p = f.lastIndexOf('.');
@ -350,7 +350,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public Map<DigestURI, String> getAnchors() {
public Map<MultiProtocolURI, String> getAnchors() {
// returns a url (String) / name (String) relation
return anchors;
}
@ -359,7 +359,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* get all images
* @return a map of <urlhash, ImageEntry>
*/
public HashMap<String, ImageEntry> getImages() {
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
// this resturns a String(absolute url)/htmlFilterImageEntry - relation
return images;
}
@ -369,9 +369,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
/**
* @return the {@link DigestURI} to the favicon that belongs to the document
* @return the {@link MultiProtocolURI} to the favicon that belongs to the document
*/
public DigestURI getFavicon() {
public MultiProtocolURI getFavicon() {
return this.favicon;
}
@ -442,7 +442,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s == null) s = metas.get("dc.description");
if (s == null) s = "";
if (s.length() == 0) {
return DigestURI.splitpattern.split(getTitle().toLowerCase());
return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
}
if (s.contains(",")) return s.split(" |,");
if (s.contains(";")) return s.split(" |;");
@ -536,32 +536,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8",new DigestURI("http://localhost", null),null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
// scrape content
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost", null));
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
return scraper;
}
public static void addAllImages(final HashMap<String, ImageEntry> a, final HashMap<String, ImageEntry> b) {
final Iterator<Map.Entry<String, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<String, ImageEntry> ie;
public static void addAllImages(final HashMap<MultiProtocolURI, ImageEntry> a, final HashMap<MultiProtocolURI, ImageEntry> b) {
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<MultiProtocolURI, ImageEntry> ie;
while (i.hasNext()) {
ie = i.next();
addImage(a, ie.getValue());
}
}
public static void addImage(final HashMap<String, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(new String(ie.url().hash()))) {
public static void addImage(final HashMap<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(ie.url())) {
// in case of a collision, take that image that has the better image size tags
if ((ie.height() > 0) && (ie.width() > 0)) a.put(new String(ie.url().hash()), ie);
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie);
} else {
a.put(new String(ie.url().hash()), ie);
a.put(ie.url(), ie);
}
}

View File

@ -26,16 +26,16 @@ package net.yacy.document.parser.html;
import java.util.Comparator;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
private final DigestURI url;
private final MultiProtocolURI url;
private final String alt;
private final int width, height;
private final long fileSize;
public ImageEntry(final DigestURI url, final String alt, final int width, final int height, long fileSize) {
public ImageEntry(final MultiProtocolURI url, final String alt, final int width, final int height, long fileSize) {
this.url = url;
this.alt = alt;
this.width = width;
@ -43,7 +43,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
this.fileSize = fileSize;
}
public DigestURI url() {
public MultiProtocolURI url() {
return this.url;
}

View File

@ -35,7 +35,7 @@ import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Properties;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
public class ScraperInputStream extends InputStream implements ScraperListener {
@ -58,7 +58,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
final DigestURI rooturl,
final MultiProtocolURI rooturl,
final Transformer transformer,
final boolean passbyIfBinarySuspect
) {

View File

@ -34,6 +34,7 @@ import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
@ -41,7 +42,6 @@ import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
@ -84,7 +84,7 @@ public class htmlParser extends AbstractParser implements Idiom {
@Override
public Document parse(
final DigestURI location,
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws ParserException, InterruptedException {
@ -136,7 +136,7 @@ public class htmlParser extends AbstractParser implements Idiom {
return transformScraper(location, mimeType, documentCharset, scraper);
}
private static Document transformScraper(final DigestURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
private static Document transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];

View File

@ -50,13 +50,13 @@ import com.sun.image.codec.jpeg.JPEGCodec;
import com.sun.image.codec.jpeg.JPEGDecodeParam;
import com.sun.image.codec.jpeg.JPEGImageDecoder;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -88,7 +88,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
@SuppressWarnings("unchecked")
@Override
public Document parse(
final DigestURI location,
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws ParserException, InterruptedException {
@ -170,11 +170,11 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
final HashSet<String> languages = new HashSet<String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
// add this image to the map of images
String infoString = ii.info.toString();
images.put(infoString, new ImageEntry(location, "", ii.width, ii.height, -1));
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
if (title == null) title = location.toNormalform(true, true);
@ -204,7 +204,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
public static ImageInfo parseJavaImage(
final DigestURI location,
final MultiProtocolURI location,
final InputStream sourceStream) throws ParserException {
BufferedImage image = null;
try {
@ -222,7 +222,7 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
public static ImageInfo parseJavaImage(
final DigestURI location,
final MultiProtocolURI location,
final BufferedImage image) {
ImageInfo ii = new ImageInfo(location);
ii.image = image;
@ -259,12 +259,12 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
public static class ImageInfo {
public DigestURI location;
public MultiProtocolURI location;
public BufferedImage image;
public StringBuilder info;
public int height;
public int width;
public ImageInfo(final DigestURI location) {
public ImageInfo(final MultiProtocolURI location) {
this.location = location;
this.image = null;
this.info = new StringBuilder();
@ -278,9 +278,9 @@ public class genericImageParser extends AbstractParser implements Idiom {
public static void main(final String[] args) {
File image = new File(args[0]);
genericImageParser parser = new genericImageParser();
DigestURI uri;
MultiProtocolURI uri;
try {
uri = new DigestURI("http://localhost/" + image.getName());
uri = new MultiProtocolURI("http://localhost/" + image.getName());
Document document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image));
System.out.println(document.toString());
} catch (MalformedURLException e) {

View File

@ -39,13 +39,13 @@ import java.util.zip.ZipFile;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
@ -106,7 +106,7 @@ public class odtParser extends AbstractParser implements Idiom {
}
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
Writer writer = null;
File writerFile = null;
@ -228,7 +228,7 @@ public class odtParser extends AbstractParser implements Idiom {
}
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File dest = null;
try {
// creating a tempfile

View File

@ -39,13 +39,13 @@ import java.util.zip.ZipFile;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -90,7 +90,7 @@ public class ooxmlParser extends AbstractParser implements Idiom {
}
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
Writer writer = null;
File writerFile = null;
@ -215,7 +215,7 @@ public class ooxmlParser extends AbstractParser implements Idiom {
}
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File dest = null;
try {
// creating a tempfile

View File

@ -44,11 +44,11 @@ import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -84,7 +84,7 @@ public class pdfParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
// create a pdf parser
final PDDocument theDocument;

View File

@ -32,11 +32,11 @@ import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
@ -70,7 +70,7 @@ public class pptParser extends AbstractParser implements Idiom {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType,
public Document parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source) throws ParserException,
InterruptedException {
try {

View File

@ -37,11 +37,11 @@ import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
@ -104,7 +104,7 @@ public class psParser extends AbstractParser implements Idiom {
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException {
File outputFile = null;
try {
@ -277,7 +277,7 @@ public class psParser extends AbstractParser implements Idiom {
super.reset();
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {

View File

@ -40,18 +40,18 @@ import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
@ -78,11 +78,11 @@ public class rssParser extends AbstractParser implements Idiom {
super("Rich Site Summary/Atom Feed Parser");
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
final LinkedList<String> feedSections = new LinkedList<String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
final ByteBuffer text = new ByteBuffer();
final CharBuffer authors = new CharBuffer();
@ -119,20 +119,20 @@ public class rssParser extends AbstractParser implements Idiom {
if (feed.getImage() != null) {
try {
DigestURI imgURL = new DigestURI(feed.getImage(), null);
images.put(new String(imgURL.hash()), new ImageEntry(imgURL, feedTitle, -1, -1, -1));
MultiProtocolURI imgURL = new MultiProtocolURI(feed.getImage());
images.put(imgURL, new ImageEntry(imgURL, feedTitle, -1, -1, -1));
} catch (MalformedURLException e) {}
}
// loop through the feed items
for (final RSSMessage item: feed) {
for (final Hit item: feed) {
// check for interruption
checkInterruption();
final String itemTitle = item.getTitle();
DigestURI itemURL = null;
MultiProtocolURI itemURL = null;
try {
itemURL = new DigestURI(item.getLink(), null);
itemURL = new MultiProtocolURI(item.getLink());
} catch (MalformedURLException e) {
continue;
}
@ -164,12 +164,12 @@ public class rssParser extends AbstractParser implements Idiom {
feedSections.add(itemHeadline);
}
final Map<DigestURI, String> itemLinks = scraper.getAnchors();
final Map<MultiProtocolURI, String> itemLinks = scraper.getAnchors();
if (itemLinks != null && !itemLinks.isEmpty()) {
anchors.putAll(itemLinks);
}
final HashMap<String, ImageEntry> itemImages = scraper.getImages();
final HashMap<MultiProtocolURI, ImageEntry> itemImages = scraper.getImages();
if (itemImages != null && !itemImages.isEmpty()) {
ContentScraper.addAllImages(images, itemImages);
}

View File

@ -34,11 +34,11 @@ import java.util.Set;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
public class rtfParser extends AbstractParser implements Idiom {
@ -62,7 +62,7 @@ public class rtfParser extends AbstractParser implements Idiom {
super("Rich Text Format Parser");
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try {

View File

@ -36,12 +36,12 @@ import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -69,7 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
super("7zip Archive Parser");
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, false);
Handler archive;
super.theLogger.logFine("opening 7zip archive...");
@ -99,13 +99,13 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset,
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
final byte[] source) throws ParserException, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source));
}
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset,
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
final File sourceFile) throws ParserException, InterruptedException {
try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"));
@ -114,7 +114,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
}
public Document parse(final DigestURI location, final String mimeType, final String charset,
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {
try {
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
@ -189,7 +189,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
Document theDoc;
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final MultiProtocolURI url = MultiProtocolURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());

View File

@ -33,11 +33,11 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import pt.tumba.parser.swf.SWF2HTML;
@ -74,7 +74,7 @@ public class swfParser extends AbstractParser implements Idiom {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try {
final SWF2HTML swf2html = new SWF2HTML();
@ -97,7 +97,7 @@ public class swfParser extends AbstractParser implements Idiom {
final String[] sections = null;
final String abstrct = null;
//TreeSet images = null;
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
@ -114,7 +114,7 @@ public class swfParser extends AbstractParser implements Idiom {
urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd);
urlnr = (Integer.valueOf(++urls)).toString();
anchors.put(new DigestURI(url, null), urlnr);
anchors.put(new MultiProtocolURI(url), urlnr);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}

View File

@ -38,6 +38,7 @@ import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
@ -45,7 +46,6 @@ import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
@ -81,7 +81,7 @@ public class tarParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
@ -106,8 +106,8 @@ public class tarParser extends AbstractParser implements Idiom {
final LinkedList<String> docSections = new LinkedList<String>();
final StringBuilder docAbstrct = new StringBuilder();
final Map<DigestURI, String> docAnchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> docImages = new HashMap<String, ImageEntry>();
final Map<MultiProtocolURI, String> docAnchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> docImages = new HashMap<MultiProtocolURI, ImageEntry>();
// looping through the contained files
TarEntry entry;
@ -143,7 +143,7 @@ public class tarParser extends AbstractParser implements Idiom {
checkInterruption();
// parsing the content
subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
} catch (final ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
} finally {

View File

@ -36,12 +36,12 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.BDecoder;
import net.yacy.kelondro.util.FileUtils;
@ -75,7 +75,7 @@ public class torrentParser extends AbstractParser implements Idiom {
}
@Override
public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
public Document parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
byte[] b = null;
try {
b = FileUtils.read(source);
@ -141,7 +141,7 @@ public class torrentParser extends AbstractParser implements Idiom {
try {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document d = parser.parse(new DigestURI("http://localhost/test.torrent", null), null, "utf-8", b);
Document d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", b);
Condenser c = new Condenser(d, true, true);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);

View File

@ -37,11 +37,11 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.order.Base64Order;
/**
@ -80,13 +80,13 @@ public class vcfParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
try {
final StringBuilder parsedTitle = new StringBuilder();
final StringBuilder parsedDataText = new StringBuilder();
final HashMap<String, String> parsedData = new HashMap<String, String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final LinkedList<String> parsedNames = new LinkedList<String>();
boolean useLastLine = false;
@ -195,7 +195,7 @@ public class vcfParser extends AbstractParser implements Idiom {
parsedData.clear();
} else if (key.toUpperCase().startsWith("URL")) {
try {
final DigestURI newURL = new DigestURI(value, null);
final MultiProtocolURI newURL = new MultiProtocolURI(value);
anchors.put(newURL, newURL.toString());
//parsedData.put(key,value);
} catch (final MalformedURLException ex) {/* ignore this */}

View File

@ -31,11 +31,11 @@ import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
@ -82,7 +82,7 @@ public class vsdParser extends AbstractParser implements Idiom {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
Document theDoc = null;

View File

@ -31,11 +31,11 @@ import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
@ -76,7 +76,7 @@ public class xlsParser extends AbstractParser implements Idiom {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType,
public Document parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source) throws ParserException,
InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source);
@ -111,7 +111,7 @@ public class xlsParser extends AbstractParser implements Idiom {
* parses the source documents and returns a Document containing
* all extracted information about the parsed document
*/
public Document parse(final DigestURI location, final String mimeType,
public Document parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source) throws ParserException,
InterruptedException {
try {

View File

@ -39,6 +39,7 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
@ -46,7 +47,6 @@ import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
@ -82,7 +82,7 @@ public class zipParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
@ -95,8 +95,8 @@ public class zipParser extends AbstractParser implements Idiom {
final StringBuilder docLongTitle = new StringBuilder();
final LinkedList<String> docSections = new LinkedList<String>();
final StringBuilder docAbstrct = new StringBuilder();
final Map<DigestURI, String> docAnchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> docImages = new HashMap<String, ImageEntry>();
final Map<MultiProtocolURI, String> docAnchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> docImages = new HashMap<MultiProtocolURI, ImageEntry>();
// looping through the contained files
ZipEntry entry;
@ -129,7 +129,7 @@ public class zipParser extends AbstractParser implements Idiom {
FileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
// parsing the zip file entry
subDoc = TextParser.parseSource(DigestURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
subDoc = TextParser.parseSource(MultiProtocolURI.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
} catch (final ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
} finally {

File diff suppressed because it is too large Load Diff

View File

@ -595,7 +595,7 @@ public class Domains {
}
public static boolean isLocal(final String host) {
assert (host != null);
if (host == null) return true;
// FIXME IPv4 only
// check local ip addresses

View File

@ -283,7 +283,7 @@ public class Blacklist {
}
public boolean isListed(final String blacklistType, final DigestURI url) {
if (url.getHost() == null) return false;
final HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
if (!urlHashCache.has(url.hash())) {
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());

View File

@ -51,6 +51,7 @@ import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.FileLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
@ -73,17 +74,19 @@ public final class LoaderDispatcher {
private final HTTPLoader httpLoader;
private final FTPLoader ftpLoader;
private final SMBLoader smbLoader;
private final FileLoader fileLoader;
private final Log log;
public LoaderDispatcher(final Switchboard sb) {
this.sb = sb;
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb"}));
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb","file"}));
// initiate loader objects
this.log = new Log("LOADER");
httpLoader = new HTTPLoader(sb, log);
ftpLoader = new FTPLoader(sb, log);
smbLoader = new SMBLoader(sb, log);
fileLoader = new FileLoader(sb, log);
}
public boolean isSupportedProtocol(final String protocol) {
@ -251,13 +254,14 @@ public final class LoaderDispatcher {
}
// now it's for sure that we will access the target. Remember the access time
accessTime.put(host, System.currentTimeMillis());
if (host != null) accessTime.put(host, System.currentTimeMillis());
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (protocol.equals("file")) response = fileLoader.load(request, true);
if (response != null) {
// we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache

View File

@ -2,13 +2,14 @@ package de.anomic.yacy;
import java.net.MalformedURLException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.data.meta.DigestURI;
import junit.framework.TestCase;
public class yacyURLTest extends TestCase {
public void testResolveBackpath() throws MalformedURLException {
public void testResolveBackpath() {
String[][] testStrings = new String[][] {
new String[]{"/..home","/..home"},
new String[]{"/test/..home/test.html","/test/..home/test.html"},
@ -23,14 +24,13 @@ public class yacyURLTest extends TestCase {
new String[]{"/home/..test/../hallo/../","/home/"}
};
DigestURI urlObj = new DigestURI("http://yacy.net");
for (int i=0; i < testStrings.length; i++) {
// desired conversion result
System.out.print("testResolveBackpath: " + testStrings[i][0]);
String shouldBe = testStrings[i][1];
// conversion result
String resolvedURL = urlObj.resolveBackpath(testStrings[i][0]);
String resolvedURL = MultiProtocolURI.resolveBackpath(testStrings[i][0]);
// test if equal
assertEquals(shouldBe,resolvedURL);