mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
a563b05b60
- added a new queue 'noload' which can be filled with urls where it is already known that the content cannot be loaded. This may be because there is no parser available or the file is too big - the noload queue is emptied with the parser process which indexes the file names only - the 'start from file' functionality now also reads from ftp crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7368 6c8d7289-2bf4-0310-a012-ef5d649a1542
267 lines
11 KiB
Java
Executable File
267 lines
11 KiB
Java
Executable File
// Request.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.03.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
package de.anomic.crawler.retrieval;
|
|
|
|
import java.io.IOException;
|
|
import java.io.UnsupportedEncodingException;
|
|
import java.util.Date;
|
|
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.index.Row;
|
|
import net.yacy.kelondro.order.Base64Order;
|
|
import net.yacy.kelondro.order.Bitfield;
|
|
import net.yacy.kelondro.order.NaturalOrder;
|
|
import net.yacy.kelondro.workflow.WorkflowJob;
|
|
|
|
public class Request extends WorkflowJob {
|
|
|
|
// row definition for balancer-related NURL-entries
|
|
public final static Row rowdef = new Row(
|
|
"String urlhash-" + Word.commonHashLength + ", " + // the url's hash
|
|
"String initiator-" + Word.commonHashLength + ", " + // the crawling initiator
|
|
"String urlstring-256, " + // the url as string
|
|
"String refhash-" + Word.commonHashLength + ", " + // the url's referrer hash
|
|
"String urlname-80, " + // the name of the url, from anchor tag <a>name</a>
|
|
"Cardinal appdate-8 {b256}, " + // the date of the resource; either file date or first appearance
|
|
"String profile-" + Word.commonHashLength + ", " + // the name of the prefetch profile handle
|
|
"Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
|
|
"Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
|
|
"Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
|
|
"byte[] flags-4, " + // flags
|
|
"Cardinal handle-4 {b256}, " + // handle (NOT USED)
|
|
"Cardinal loaddate-8 {b256}, " + // NOT USED
|
|
"Cardinal lastmodified-8 {b256}, " + // NOT USED
|
|
"Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
|
|
Base64Order.enhancedCoder
|
|
);
|
|
|
|
private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy;
|
|
// if this is generated by a crawl, the own peer hash in entered
|
|
private byte[] refhash; // the url's referrer hash
|
|
private DigestURI url; // the url as string
|
|
private String name; // the name of the url, from anchor tag <a>name</a>
|
|
private long appdate; // the time when the url was first time appeared.
|
|
private String profileHandle; // the name of the fetch profile
|
|
private int depth; // the prefetch depth so far, starts at 0
|
|
private int anchors; // number of anchors of the parent
|
|
private int forkfactor; // sum of anchors of all ancestors
|
|
private Bitfield flags;
|
|
private long size; // size of resource in bytes (if known) or 0 if not known
|
|
private String statusMessage;
|
|
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
|
|
|
|
/**
|
|
* convenience method for 'full' request object
|
|
* @param url
|
|
* @param referrerhash
|
|
*/
|
|
public Request(final DigestURI url, final byte[] referrerhash) {
|
|
this(null, url, referrerhash, null, null, null, 0, 0, 0, 0);
|
|
}
|
|
|
|
/**
|
|
* A Request Entry is a object that is created to provide
|
|
* all information to load a specific resource.
|
|
*
|
|
* @param initiator the hash of the initiator peer
|
|
* @param url the {@link URL} to crawl
|
|
* @param referrer the hash of the referrer URL
|
|
* @param name the name of the document to crawl
|
|
* @param appdate the time when the url was first time appeared
|
|
* @param profileHandle the name of the prefetch profile. This must not be null!
|
|
* @param depth the crawling depth of the entry
|
|
* @param anchors number of anchors of the parent
|
|
* @param forkfactor sum of anchors of all ancestors
|
|
*/
|
|
public Request(
|
|
final byte[] initiator,
|
|
final DigestURI url,
|
|
final byte[] referrerhash,
|
|
final String name,
|
|
final Date appdate,
|
|
final String profileHandle,
|
|
final int depth,
|
|
final int anchors,
|
|
final int forkfactor,
|
|
final long size
|
|
) {
|
|
// create new entry and store it into database
|
|
assert url != null;
|
|
assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle + " != " + Word.commonHashLength;
|
|
url.removeRef(); // remove anchor reference
|
|
this.initiator = (initiator == null) ? null : ((initiator.length == 0) ? null : initiator);
|
|
this.url = url;
|
|
this.refhash = referrerhash;
|
|
this.name = (name == null) ? "" : name;
|
|
this.appdate = (appdate == null) ? 0 : appdate.getTime();
|
|
this.profileHandle = profileHandle; // must not be null
|
|
this.depth = depth;
|
|
this.anchors = anchors;
|
|
this.forkfactor = forkfactor;
|
|
this.flags = new Bitfield(rowdef.width(10));
|
|
this.statusMessage = "loaded(args)";
|
|
this.initialHash = url.hashCode();
|
|
this.status = WorkflowJob.STATUS_INITIATED;
|
|
this.size = size;
|
|
}
|
|
|
|
public Request(final Row.Entry entry) throws IOException {
|
|
assert (entry != null);
|
|
insertEntry(entry);
|
|
}
|
|
|
|
private void insertEntry(final Row.Entry entry) throws IOException {
|
|
final String urlstring = entry.getColString(2, null);
|
|
if (urlstring == null) throw new IOException ("url string is null");
|
|
this.initiator = entry.getColBytes(1, true);
|
|
this.initiator = (initiator == null) ? null : ((initiator.length == 0) ? null : initiator);
|
|
this.url = new DigestURI(urlstring, entry.getColBytes(0, true));
|
|
this.refhash = (entry.empty(3)) ? null : entry.getColBytes(3, true);
|
|
this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
|
|
this.appdate = entry.getColLong(5);
|
|
this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
|
|
this.depth = (int) entry.getColLong(7);
|
|
this.anchors = (int) entry.getColLong(8);
|
|
this.forkfactor = (int) entry.getColLong(9);
|
|
this.flags = new Bitfield(entry.getColBytes(10, true));
|
|
//this.loaddate = entry.getColLong(12);
|
|
//this.lastmodified = entry.getColLong(13);
|
|
this.size = entry.getColLong(14);
|
|
this.statusMessage = "loaded(kelondroRow.Entry)";
|
|
this.initialHash = url.hashCode();
|
|
return;
|
|
}
|
|
|
|
public int hashCode() {
|
|
// overloads Object.hashCode()
|
|
return this.initialHash;
|
|
}
|
|
|
|
public void setStatus(final String s, int code) {
|
|
//System.out.println("***DEBUG*** crawler status " + s + ", " + code + " for " + this.url.toNormalform(true, false));
|
|
this.statusMessage = s;
|
|
this.status = code;
|
|
}
|
|
|
|
public String getStatus() {
|
|
return this.statusMessage;
|
|
}
|
|
|
|
public Row.Entry toRow() {
|
|
final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5));
|
|
final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
|
|
final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
|
|
final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14));
|
|
// store the hash in the hash cache
|
|
byte[] namebytes;
|
|
try {
|
|
namebytes = this.name.getBytes("UTF-8");
|
|
} catch (final UnsupportedEncodingException e) {
|
|
namebytes = this.name.getBytes();
|
|
}
|
|
final byte[][] entry = new byte[][] {
|
|
this.url.hash(),
|
|
initiator,
|
|
this.url.toString().getBytes(),
|
|
this.refhash,
|
|
namebytes,
|
|
appdatestr,
|
|
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
|
|
NaturalOrder.encodeLong(this.depth, rowdef.width(7)),
|
|
NaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
|
|
NaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
|
|
this.flags.bytes(),
|
|
NaturalOrder.encodeLong(0, rowdef.width(11)),
|
|
loaddatestr,
|
|
serverdatestr,
|
|
sizestr};
|
|
return rowdef.newEntry(entry);
|
|
}
|
|
|
|
public DigestURI url() {
|
|
// the url
|
|
return url;
|
|
}
|
|
|
|
public void redirectURL(final DigestURI redirectedURL) {
|
|
// replace old URL by new one. This should only be used in case of url redirection
|
|
this.url = redirectedURL;
|
|
}
|
|
|
|
public byte[] referrerhash() {
|
|
// the urlhash of a referer url
|
|
return this.refhash;
|
|
}
|
|
|
|
public byte[] initiator() {
|
|
// returns the hash of the initiating peer
|
|
return initiator;
|
|
}
|
|
|
|
public boolean proxy() {
|
|
// true when the url was retrieved using the proxy
|
|
return (initiator() == null || initiator().length == 0);
|
|
}
|
|
|
|
public Date appdate() {
|
|
// the date when the url appeared first
|
|
return new Date(this.appdate);
|
|
}
|
|
/*
|
|
public Date loaddate() {
|
|
// the date when the url was loaded
|
|
return new Date(this.loaddate);
|
|
}
|
|
|
|
public Date lastmodified() {
|
|
// the date that the server returned as document date
|
|
return new Date(this.lastmodified);
|
|
}
|
|
*/
|
|
public long size() {
|
|
// the date that the client (browser) send as ifModifiedSince in proxy mode
|
|
return this.size;
|
|
}
|
|
|
|
public String name() {
|
|
// return the anchor name (text inside <a> tag)
|
|
return this.name;
|
|
}
|
|
|
|
public int depth() {
|
|
// crawl depth where the url appeared
|
|
return this.depth;
|
|
}
|
|
|
|
public String profileHandle() {
|
|
// the handle of the crawl profile
|
|
assert profileHandle.length() == Word.commonHashLength : profileHandle + " != " + Word.commonHashLength;
|
|
return this.profileHandle;
|
|
}
|
|
|
|
} |