yacy_search_server/source/net/yacy/crawler/retrieval/Request.java
2022-01-26 11:22:43 +01:00

297 lines
11 KiB
Java

// Request.java
// SPDX-FileCopyrightText: 2007 Michael Peter Christen <mc@yacy.net>
// SPDX-License-Identifier: GPL-2.0-or-later
// Frankfurt a. M., Germany
// first published 14.03.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.workflow.WorkflowJob;
/**
 * A crawl request: all information that is needed to load one specific resource.
 *
 * <p>A request can be created from explicit arguments, or restored from a
 * {@link Row.Entry} that was produced by {@link #toRow()}. The no-arg constructor
 * creates an empty "poison" entry whose reference fields are all {@code null}.
 *
 * <p>{@link #hashCode()} is fixed at construction time from the URL, so it stays
 * stable even if the URL is later replaced by {@link #redirectURL(DigestURL)}.
 */
public class Request extends WorkflowJob {

    /** Row definition for balancer-related NURL-entries. */
    public final static Row rowdef = new Row(
        "String urlhash-" + Word.commonHashLength + ", "      // 0: the url's hash
            + "String initiator-" + Word.commonHashLength + ", "  // 1: the crawling initiator
            + "String urlstring-2048, "                           // 2: the url as string
            + "String refhash-" + Word.commonHashLength + ", "    // 3: the url's referrer hash
            + "String urlname-256, "                              // 4: the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
            + "Cardinal appdate-8 {b256}, "                       // 5: the date of the resource; either file date or first appearance
            + "String profile-" + Word.commonHashLength + ", "    // 6: the name of the prefetch profile handle
            + "Cardinal depth-2 {b256}, "                         // 7: the prefetch depth so far, starts at 0
            + "Cardinal parentbr-3 {b256}, "                      // 8: number of anchors of the parent (NOT USED)
            + "Cardinal forkfactor-4 {b256}, "                    // 9: sum of anchors of all ancestors (NOT USED)
            + "byte[] flags-4, "                                  // 10: flags
            + "Cardinal handle-4 {b256}, "                        // 11: handle (NOT USED)
            + "Cardinal loaddate-8 {b256}, "                      // 12: NOT USED
            + "Cardinal lastmodified-8 {b256}, "                  // 13: NOT USED
            + "Cardinal size-8 {b256}",                           // 14: size of resource in bytes (if known) or 0 if not known
        Base64Order.enhancedCoder);

    /** Cell width of the {@code urlname} column (column 4) of {@link #rowdef}. */
    public final static int descrLength = rowdef.column(4).cellwidth;

    private byte[] initiator;     // the initiator hash, is null if it is the own proxy;
                                  // if this is generated by a crawl, the own peer hash is entered
    private byte[] refhash;       // the url's referrer hash
    private DigestURL url;        // the url of the resource to load
    private String name;          // the name of the url, from anchor tag <a>name</a>
    private long appdate;         // the time when the url first appeared, in milliseconds
    private String profileHandle; // the name of the fetch profile
    private int depth;            // the prefetch depth so far, starts at 0
    private Bitfield flags;
    private String statusMessage;
    private int initialHash;      // to provide an object hash that does not change even if the url changes because of redirection
    private int timezoneOffset;   // not part of the persisted row, see toRow()

    /**
     * Creates an empty entry; used only to create poison entries.
     * All reference fields (including the URL and the flags) are {@code null}.
     */
    public Request() {
        this.initiator = null;
        this.url = null;
        this.refhash = null;
        this.name = null;
        this.appdate = 0;
        this.profileHandle = null;
        this.depth = 0;
        this.flags = null;
        this.statusMessage = null;
        this.initialHash = 0;
        this.status = 0;
        this.timezoneOffset = 0;
    }

    /**
     * Convenience constructor: delegates to the full constructor with no initiator,
     * no name, no appearance date, no profile handle, depth 0 and time zone offset 0.
     *
     * @param url the url to crawl, must not be null
     * @param referrerhash the hash of the referrer URL
     */
    public Request(final DigestURL url, final byte[] referrerhash) {
        this(null, url, referrerhash, null, null, null, 0, 0);
    }

    /**
     * A Request Entry is an object that is created to provide all information to load a specific resource.
     *
     * <p>Note that the anchor reference is removed from the passed {@code url} object itself.
     *
     * @param initiator the hash of the initiator peer; {@code null} or an empty array are both stored as {@code null}
     * @param url the {@link URL} to crawl, must not be null
     * @param referrerhash the hash of the referrer URL
     * @param name the name of the document to crawl; {@code null} is stored as the empty string
     * @param appdate the time when the url first appeared; {@code null} is stored as 0
     * @param profileHandle the name of the prefetch profile; if not null, its length is
     *            asserted to be {@link Word#commonHashLength}
     * @param depth the crawling depth of the entry
     * @param timezoneOffset the time zone offset to associate with this request; it is kept
     *            in memory only and is not written by {@link #toRow()}
     */
    public Request(
        final byte[] initiator,
        final DigestURL url,
        final byte[] referrerhash,
        final String name,
        final Date appdate,
        final String profileHandle,
        final int depth,
        final int timezoneOffset) {
        // create new entry and store it into database
        assert url != null;
        assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
            + " != "
            + Word.commonHashLength;
        url.removeRef(); // remove anchor reference
        this.initiator = (initiator == null || initiator.length == 0) ? null : initiator;
        this.url = url;
        this.refhash = referrerhash;
        this.name = (name == null) ? "" : name;
        this.appdate = (appdate == null) ? 0 : appdate.getTime();
        this.profileHandle = profileHandle;
        this.depth = depth;
        this.timezoneOffset = timezoneOffset;
        this.flags = new Bitfield(rowdef.width(10));
        this.statusMessage = "loaded(args)";
        this.initialHash = url.hashCode();
        this.status = WorkflowJob.STATUS_INITIATED;
    }

    /**
     * Restores a request from a row entry that follows {@link #rowdef}.
     * The time zone offset is not stored in the row and therefore stays 0.
     *
     * @param entry the row entry to read from, must not be null
     * @throws IOException if the entry has no url string or cannot be decoded
     */
    public Request(final Row.Entry entry) throws IOException {
        assert (entry != null);
        insertEntry(entry);
    }

    /**
     * Fills the fields of this object from the columns of the given row entry.
     *
     * @param entry the row entry to read from
     * @throws IOException if the url string column is null, or if anything else fails
     *             while decoding; the original failure is attached as the cause
     */
    private void insertEntry(final Row.Entry entry) throws IOException {
        try {
            final String urlstring = entry.getColUTF8(2);
            if (urlstring == null) {
                throw new IOException("url string is null");
            }
            final byte[] initiatorBytes = entry.getColBytes(1, true);
            // an empty initiator is normalized to null, as in the argument constructor
            this.initiator = (initiatorBytes == null || initiatorBytes.length == 0) ? null : initiatorBytes;
            this.url = new DigestURL(urlstring, entry.getPrimaryKeyBytes());
            this.refhash = (entry.empty(3)) ? null : entry.getColBytes(3, true);
            this.name = (entry.empty(4)) ? "" : entry.getColUTF8(4).trim();
            this.appdate = entry.getColLong(5);
            this.profileHandle = (entry.empty(6)) ? null : entry.getColASCII(6).trim();
            this.depth = (int) entry.getColLong(7);
            // columns 8 (anchors), 9 (forkfactor), 12 (loaddate) and 13 (lastmodified) are not read
            this.flags = new Bitfield(entry.getColBytes(10, true));
            this.statusMessage = "loaded(kelondroRow.Entry)";
            this.initialHash = this.url.hashCode();
        } catch (final Throwable e) {
            // callers only expect IOException; keep the message but preserve the
            // original failure as cause so that the stack trace is not lost
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Returns the hash that was computed from the URL when this object was created.
     * It does not change when the URL is replaced by {@link #redirectURL(DigestURL)}.
     */
    @Override
    public int hashCode() {
        return this.initialHash;
    }

    /**
     * Sets the status message and the status code of this request.
     *
     * @param s the new status message
     * @param code the new status code
     */
    public void setStatus(final String s, final int code) {
        this.statusMessage = s;
        this.status = code;
    }

    /**
     * @return the current status message
     */
    public String getStatus() {
        return this.statusMessage;
    }

    /**
     * Serializes this request into a row entry that follows {@link #rowdef}.
     * The unused columns are filled with zeros; the time zone offset is not stored.
     * This dereferences the URL and the flags, so it must not be called on a poison entry.
     *
     * @return a new row entry holding the data of this request
     */
    public Row.Entry toRow() {
        final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(5));
        final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
        final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
        final byte[] namebytes = UTF8.getBytes(this.name);
        final byte[][] entry =
            new byte[][] {
                this.url.hash(),
                this.initiator,
                UTF8.getBytes(this.url.toNormalform(false)),
                this.refhash,
                namebytes,
                appdatestr,
                (this.profileHandle == null) ? null : ASCII.getBytes(this.profileHandle),
                NaturalOrder.encodeLong(this.depth, rowdef.width(7)),
                NaturalOrder.encodeLong(0, rowdef.width(8)), // anchors
                NaturalOrder.encodeLong(0, rowdef.width(9)), // forkfactor
                this.flags.bytes(),
                NaturalOrder.encodeLong(0, rowdef.width(11)), // handle
                loaddatestr,
                serverdatestr,
                new byte[0] // dummy, not used (any more)
            };
        return rowdef.newEntry(entry);
    }

    /**
     * @return the url of this request; null for poison entries
     */
    public DigestURL url() {
        return this.url;
    }

    /**
     * Replaces the old URL by a new one. This should only be used in case of url redirection.
     * The value returned by {@link #hashCode()} is not affected.
     *
     * @param redirectedURL the url that replaces the current one
     */
    public void redirectURL(final DigestURL redirectedURL) {
        this.url = redirectedURL;
    }

    /**
     * @return the urlhash of the referrer url, may be null
     */
    public byte[] referrerhash() {
        return this.refhash;
    }

    /**
     * @return the hash of the initiating peer, or null if there is none
     */
    public byte[] initiator() {
        return this.initiator;
    }

    /**
     * @return true when the url was retrieved using the proxy, i.e. when no initiator is set
     */
    public boolean proxy() {
        return (initiator() == null || initiator().length == 0);
    }

    /**
     * @return the date when the url appeared first, as a new {@link Date} object
     */
    public Date appdate() {
        return new Date(this.appdate);
    }

    /**
     * @return the anchor name (text inside the &lt;a&gt; tag)
     */
    public String name() {
        return this.name;
    }

    /**
     * @return the crawl depth where the url appeared
     */
    public int depth() {
        return this.depth;
    }

    /**
     * @return the time zone offset that was passed to the constructor, 0 otherwise
     */
    public int timezoneOffset() {
        return this.timezoneOffset;
    }

    /**
     * @return the handle of the crawl profile, may be null
     */
    public String profileHandle() {
        assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;
        return this.profileHandle;
    }

    /**
     * @return the normal form of the url, or the string "null" for poison entries
     *         that carry no url
     */
    @Override
    public String toString() {
        // poison entries are created with a null url; toString must not throw for them
        return (this.url == null) ? "null" : this.url.toNormalform(true);
    }
}