yacy_search_server/source/net/yacy/crawler/retrieval/Request.java
Michael Peter Christen fed26f33a8 enhanced timezone management for indexed data:
to support the new time parser and search functions in YaCy, a high
precision detection of date and time of day is necessary. That
requires that the time zone of the document content and the time zone of
the user doing a search are detected. The time zone of the search
request is detected automatically using the browser's time zone offset, which
is delivered with the search request automatically and invisibly to the
user. The time zone for the content of web pages cannot be detected
automatically and must be an attribute of crawl starts. The advanced
crawl start now provides an input field to set the time zone in minutes
as an offset number. All parsers must get a time zone offset passed, so
this required a change of the parser Java API. A lot of other changes
have been made which correct the wrong handling of dates in YaCy, which
was to add a correction based on the time zone of the server. Now no
correction is added and all dates in YaCy are in the UTC/GMT time zone, a
normalized time zone for all peers.
2015-04-15 13:17:23 +02:00

293 lines
11 KiB
Java

// Request.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.03.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.workflow.WorkflowJob;
/**
 * A Request holds all information that is needed to load a specific resource:
 * the url, its referrer hash, the initiating peer, the crawl profile handle,
 * the crawl depth and the time zone offset given for the crawl.
 * <p>
 * A Request can be converted to a {@link Row.Entry} with {@link #toRow()} and
 * restored from one with {@link #Request(Row.Entry)}. The time zone offset is
 * not part of the row definition: {@link #toRow()} does not write it and the
 * row constructor does not read it, so it is 0 for restored requests.
 */
public class Request extends WorkflowJob
{

    // row definition for balancer-related NURL-entries
    public final static Row rowdef = new Row(
        "String urlhash-" + Word.commonHashLength + ", "        // the url's hash
        + "String initiator-" + Word.commonHashLength + ", "    // the crawling initiator
        + "String urlstring-256, "                              // the url as string
        + "String refhash-" + Word.commonHashLength + ", "      // the url's referrer hash
        + "String urlname-256, "                                // the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
        + "Cardinal appdate-8 {b256}, "                         // the date of the resource; either file date or first appearance
        + "String profile-" + Word.commonHashLength + ", "      // the name of the prefetch profile handle
        + "Cardinal depth-2 {b256}, "                           // the prefetch depth so far, starts at 0
        + "Cardinal parentbr-3 {b256}, "                        // number of anchors of the parent (NOT USED)
        + "Cardinal forkfactor-4 {b256}, "                      // sum of anchors of all ancestors (NOT USED)
        + "byte[] flags-4, "                                    // flags
        + "Cardinal handle-4 {b256}, "                          // handle (NOT USED)
        + "Cardinal loaddate-8 {b256}, "                        // NOT USED
        + "Cardinal lastmodified-8 {b256}, "                    // NOT USED
        + "Cardinal size-8 {b256}",                             // size of resource in bytes (if known) or 0 if not known
        Base64Order.enhancedCoder);

    // column positions inside rowdef; they follow the order of the definition string above
    private static final int COL_INITIATOR = 1;
    private static final int COL_URLSTRING = 2;
    private static final int COL_REFHASH = 3;
    private static final int COL_URLNAME = 4;
    private static final int COL_APPDATE = 5;
    private static final int COL_PROFILE = 6;
    private static final int COL_DEPTH = 7;
    private static final int COL_PARENTBR = 8;
    private static final int COL_FORKFACTOR = 9;
    private static final int COL_FLAGS = 10;
    private static final int COL_HANDLE = 11;
    private static final int COL_LOADDATE = 12;
    private static final int COL_LASTMODIFIED = 13;

    // width of the urlname column
    public final static int descrLength = rowdef.column(COL_URLNAME).cellwidth;

    private byte[] initiator; // the initiator hash, is NULL if it is the own proxy;
                              // if this is generated by a crawl, the own peer hash is entered
    private byte[] refhash; // the url's referrer hash
    private DigestURL url; // the url to load
    private String name; // the name of the url, from anchor tag <a>name</a>
    private long appdate; // the time when the url first appeared, in milliseconds
    private String profileHandle; // the name of the fetch profile
    private int depth; // the prefetch depth so far, starts at 0
    private Bitfield flags;
    private String statusMessage;
    private int initialHash; // to provide an object hash that does not change even if the url changes because of redirection
    private int timezoneOffset; // time zone offset passed at construction; not stored in the row

    /**
     * Creates an empty request. Used only to create poison entries.
     * All object fields are null and all numeric fields are 0.
     */
    public Request() {
        // used only to create poison entries
        this.initiator = null;
        this.url = null;
        this.refhash = null;
        this.name = null;
        this.appdate = 0;
        this.profileHandle = null;
        this.depth = 0;
        this.flags = null;
        this.statusMessage = null;
        this.initialHash = 0;
        this.status = 0;
        this.timezoneOffset = 0;
    }

    /**
     * Convenience constructor: creates a request with only an url and a referrer hash.
     * Initiator, name, appearance date and profile handle are passed as null;
     * depth and time zone offset are 0.
     *
     * @param url the url to load; must not be null
     * @param referrerhash the hash of the referrer url
     */
    public Request(final DigestURL url, final byte[] referrerhash) {
        this(null, url, referrerhash, null, null, null, 0, 0);
    }

    /**
     * A Request Entry is an object that is created to provide all information to load a specific resource.
     * The anchor reference of the given url is removed (the url object is modified).
     *
     * @param initiator the hash of the initiator peer; null or an empty array is stored as null
     * @param url the {@link DigestURL} to crawl; must not be null
     * @param referrerhash the hash of the referrer URL
     * @param name the name of the document to crawl; null is stored as an empty string
     * @param appdate the time when the url first appeared; null is stored as 0
     * @param profileHandle the name of the prefetch profile; if not null, its length
     *        is asserted to be {@code Word.commonHashLength}
     * @param depth the crawling depth of the entry
     * @param timezoneOffset the time zone offset to be used for this request
     */
    public Request(
        final byte[] initiator,
        final DigestURL url,
        final byte[] referrerhash,
        final String name,
        final Date appdate,
        final String profileHandle,
        final int depth,
        final int timezoneOffset) {
        // create new entry and store it into database
        assert url != null;
        assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle
            + " != "
            + Word.commonHashLength;
        url.removeRef(); // remove anchor reference
        this.initiator = normalizeInitiator(initiator);
        this.url = url;
        this.refhash = referrerhash;
        this.name = (name == null) ? "" : name;
        this.appdate = (appdate == null) ? 0 : appdate.getTime();
        this.profileHandle = profileHandle;
        this.depth = depth;
        this.timezoneOffset = timezoneOffset;
        this.flags = new Bitfield(rowdef.width(COL_FLAGS));
        this.statusMessage = "loaded(args)";
        this.initialHash = url.hashCode();
        this.status = WorkflowJob.STATUS_INITIATED;
    }

    /**
     * Restores a request from a row entry that follows {@link #rowdef}.
     * The time zone offset is not read from the row and therefore stays 0.
     *
     * @param entry the row entry to read from; must not be null
     * @throws IOException if the url string is missing or any column cannot be read
     */
    public Request(final Row.Entry entry) throws IOException {
        assert (entry != null);
        insertEntry(entry);
    }

    /**
     * Maps an empty initiator hash to null so that "no initiator" has one representation.
     *
     * @param hash the initiator hash, may be null
     * @return null if the hash is null or empty, otherwise the hash itself
     */
    private static byte[] normalizeInitiator(final byte[] hash) {
        return (hash == null || hash.length == 0) ? null : hash;
    }

    /**
     * Fills the fields of this request from the columns of the given row entry.
     *
     * @param entry the row entry to read from
     * @throws IOException if the url string is null or if reading any column fails;
     *         a failure is attached as the cause of the exception
     */
    private void insertEntry(final Row.Entry entry) throws IOException {
        try {
            final String urlstring = entry.getColUTF8(COL_URLSTRING);
            if ( urlstring == null ) {
                throw new IOException("url string is null");
            }
            this.initiator = normalizeInitiator(entry.getColBytes(COL_INITIATOR, true));
            this.url = new DigestURL(urlstring, entry.getPrimaryKeyBytes());
            this.refhash = (entry.empty(COL_REFHASH)) ? null : entry.getColBytes(COL_REFHASH, true);
            this.name = (entry.empty(COL_URLNAME)) ? "" : entry.getColUTF8(COL_URLNAME).trim();
            this.appdate = entry.getColLong(COL_APPDATE);
            this.profileHandle = (entry.empty(COL_PROFILE)) ? null : entry.getColASCII(COL_PROFILE).trim();
            this.depth = (int) entry.getColLong(COL_DEPTH);
            // parentbr, forkfactor, handle, loaddate, lastmodified and size are not used and not read
            this.flags = new Bitfield(entry.getColBytes(COL_FLAGS, true));
            this.statusMessage = "loaded(kelondroRow.Entry)";
            this.initialHash = this.url.hashCode();
        } catch (final Throwable e ) {
            // keep the original failure as cause so that the stack trace is not lost
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * @return the hash code of the url this request was created with; the value
     *         is not changed by {@link #redirectURL(DigestURL)}
     */
    @Override
    public int hashCode() {
        // overloads Object.hashCode()
        return this.initialHash;
    }

    /**
     * Sets the status message and the status code of this request.
     *
     * @param s the status message
     * @param code the status code
     */
    public void setStatus(final String s, final int code) {
        //System.out.println("***DEBUG*** crawler status " + s + ", " + code + " for " + this.url.toNormalform(true, false));
        this.statusMessage = s;
        this.status = code;
    }

    /**
     * @return the status message (not the status code)
     */
    public String getStatus() {
        return this.statusMessage;
    }

    /**
     * Converts this request into a row entry according to {@link #rowdef}.
     * The unused columns are written as 0 (or empty for the size column);
     * the time zone offset is not written.
     *
     * @return a new row entry holding the data of this request
     */
    public Row.Entry toRow() {
        final byte[] appdatestr = NaturalOrder.encodeLong(this.appdate, rowdef.width(COL_APPDATE));
        final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(COL_LOADDATE));
        final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(COL_LASTMODIFIED));
        final byte[] namebytes = UTF8.getBytes(this.name);
        // the array positions must match the column order of rowdef
        final byte[][] entry =
            new byte[][] {
                this.url.hash(),
                this.initiator,
                UTF8.getBytes(this.url.toNormalform(false)),
                this.refhash,
                namebytes,
                appdatestr,
                (this.profileHandle == null) ? null : ASCII.getBytes(this.profileHandle),
                NaturalOrder.encodeLong(this.depth, rowdef.width(COL_DEPTH)),
                NaturalOrder.encodeLong(0, rowdef.width(COL_PARENTBR)), // anchors
                NaturalOrder.encodeLong(0, rowdef.width(COL_FORKFACTOR)), // forkfactor
                this.flags.bytes(),
                NaturalOrder.encodeLong(0, rowdef.width(COL_HANDLE)),
                loaddatestr,
                serverdatestr,
                new byte[0] // dummy, not used (any more)
            };
        return rowdef.newEntry(entry);
    }

    /**
     * @return the url of this request
     */
    public DigestURL url() {
        // the url
        return this.url;
    }

    /**
     * Replaces the url of this request. The value of {@link #hashCode()} is not affected.
     *
     * @param redirectedURL the new url
     */
    public void redirectURL(final DigestURL redirectedURL) {
        // replace old URL by new one. This should only be used in case of url redirection
        this.url = redirectedURL;
    }

    /**
     * @return the url hash of the referrer url
     */
    public byte[] referrerhash() {
        // the urlhash of a referer url
        return this.refhash;
    }

    /**
     * @return the hash of the initiating peer, or null if there is none
     */
    public byte[] initiator() {
        // returns the hash of the initiating peer
        return this.initiator;
    }

    /**
     * @return true if no initiator is set (the url was retrieved using the proxy)
     */
    public boolean proxy() {
        // true when the url was retrieved using the proxy
        final byte[] peer = initiator();
        return (peer == null || peer.length == 0);
    }

    /**
     * @return the date when the url appeared first, as a new Date object
     */
    public Date appdate() {
        // the date when the url appeared first
        return new Date(this.appdate);
    }

    /**
     * @return the anchor name (text inside the a-tag)
     */
    public String name() {
        // return the anchor name (text inside <a> tag)
        return this.name;
    }

    /**
     * @return the crawl depth where the url appeared
     */
    public int depth() {
        // crawl depth where the url appeared
        return this.depth;
    }

    /**
     * @return the time zone offset that was passed at construction time;
     *         0 for requests restored from a row entry
     */
    public int timezoneOffset() {
        return this.timezoneOffset;
    }

    /**
     * @return the handle of the crawl profile, may be null
     */
    public String profileHandle() {
        // the handle of the crawl profile
        assert this.profileHandle == null || this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength;
        return this.profileHandle;
    }

    /**
     * @return the normal form of the url, or the string "null" if this request
     *         has no url (poison entries)
     */
    @Override
    public String toString() {
        // poison entries have no url; do not fail when such an entry is printed
        if (this.url == null) {
            return "null";
        }
        return this.url.toNormalform(true);
    }
}