mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-21 00:00:13 +02:00
fed26f33a8
To support the new time parser and search functions in YaCy, high-precision detection of the date and the time of day is necessary. This requires detecting both the time zone of the document content and the time zone of the user doing the search. The time zone of the search request is determined automatically from the browser's time zone offset, which is delivered with the search request automatically and invisibly to the user. The time zone of web page content cannot be detected automatically and must be an attribute of crawl starts. The advanced crawl start now provides an input field to set the time zone as an offset in minutes. All parsers must be passed a time zone offset, which required a change to the parser Java API. Many other changes were made to correct the wrong handling of dates in YaCy, which was to add a correction based on the time zone of the server. Now no correction is added and all dates in YaCy are in the UTC/GMT time zone, a normalized time zone for all peers.
338 lines
14 KiB
Java
338 lines
14 KiB
Java
/**
|
|
* AccessTracker
|
|
* keeps track of local and remote search queries and appends them to a query log file
|
|
* Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
|
|
* First released 29.08.2009 at http://yacy.net
|
|
*
|
|
* $LastChangedDate$
|
|
* $LastChangedRevision$
|
|
* $LastChangedBy$
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.search.query;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.ByteArrayInputStream;
|
|
import java.io.File;
|
|
import java.io.FileNotFoundException;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.io.RandomAccessFile;
|
|
import java.text.ParseException;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedList;
|
|
import java.util.List;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.WordCache;
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.kelondro.util.MemoryControl;
|
|
import net.yacy.search.EventTracker;
|
|
|
|
public class AccessTracker {
|
|
|
|
private final static long DUMP_PERIOD = 60000L;
|
|
|
|
private static final int minSize = 100;
|
|
private static final int maxSize = 1000;
|
|
private static final int maxAge = 24 * 60 * 60 * 1000;
|
|
|
|
public static class QueryEvent {
|
|
final public String address;
|
|
final public String userAgent;
|
|
final public String query;
|
|
final public Date date;
|
|
final public short offset;
|
|
final public short requestedResults;
|
|
final public short returnedResults;
|
|
final public short knownResults;
|
|
final public short executionTime;
|
|
|
|
public QueryEvent(
|
|
final String address, final String userAgent,
|
|
final String query, final Date date,
|
|
final short offset, final short requestedResults,
|
|
final short returnedResults, final short knownResults,
|
|
final short executionTime) {
|
|
this.address = address;
|
|
this.userAgent = userAgent;
|
|
this.query = query;
|
|
this.date = date;
|
|
this.offset = offset;
|
|
this.requestedResults = requestedResults;
|
|
this.returnedResults = returnedResults;
|
|
this.knownResults = knownResults;
|
|
this.executionTime = executionTime;
|
|
}
|
|
}
|
|
|
|
/**
 * Selects which of the two tracked search lists an operation refers to.
 */
public enum Location {
    /** searches kept in the local search list */
    local,
    /** searches kept in the remote search list */
    remote
}
|
|
|
|
private static final LinkedList<QueryParams> localSearches = new LinkedList<QueryParams>();
|
|
private static final LinkedList<QueryParams> remoteSearches = new LinkedList<QueryParams>();
|
|
private static final ArrayList<String> log = new ArrayList<String>();
|
|
private static long lastLogDump = System.currentTimeMillis();
|
|
private static File dumpFile = null;
|
|
|
|
public static void setDumpFile(File f) {
|
|
dumpFile = f;
|
|
}
|
|
|
|
public static File getDumpFile() {
|
|
return dumpFile;
|
|
}
|
|
|
|
public static void add(final Location location, final QueryParams query, int resultCount) {
|
|
if (location == Location.local) synchronized (localSearches) {add(localSearches, query, resultCount);}
|
|
if (location == Location.remote) synchronized (remoteSearches) {add(remoteSearches, query, resultCount);}
|
|
}
|
|
|
|
private static void add(final LinkedList<QueryParams> list, final QueryParams query, int resultCount) {
|
|
// learn that this word can be a word completion for the DidYouMeanLibrary
|
|
String queryString = query.getQueryGoal().getQueryString(false);
|
|
if (resultCount > 10 && queryString != null && queryString.length() > 0) {
|
|
final StringBuilder sb = new StringBuilder(queryString);
|
|
sb.append(queryString);
|
|
WordCache.learn(sb);
|
|
}
|
|
|
|
// add query to statistics list
|
|
list.add(query);
|
|
|
|
// shrink dump list but keep essentials in dump
|
|
while (list.size() > maxSize || (!list.isEmpty() && MemoryControl.shortStatus())) {
|
|
synchronized (list) {
|
|
if (!list.isEmpty()) addToDump(list.removeFirst(), resultCount); else break;
|
|
}
|
|
}
|
|
|
|
// if the list is small we can terminate
|
|
if (list.size() <= minSize) return;
|
|
|
|
// if the list is large we look for too old entries
|
|
final long timeout = System.currentTimeMillis() - maxAge;
|
|
while (!list.isEmpty()) {
|
|
final QueryParams q = list.getFirst();
|
|
if (q.starttime > timeout) break;
|
|
addToDump(list.removeFirst(), resultCount);
|
|
}
|
|
}
|
|
|
|
public static Iterator<QueryParams> get(final Location location) {
|
|
if (location == Location.local) return localSearches.descendingIterator();
|
|
if (location == Location.remote) return remoteSearches.descendingIterator();
|
|
return null;
|
|
}
|
|
|
|
public static int size(final Location location) {
|
|
if (location == Location.local) synchronized (localSearches) {return localSearches.size();}
|
|
if (location == Location.remote) synchronized (remoteSearches) {return remoteSearches.size();}
|
|
return 0;
|
|
}
|
|
|
|
private static void addToDump(final QueryParams query, int resultCount) {
|
|
String queryString = query.getQueryGoal().getQueryString(false);
|
|
if (queryString == null || queryString.isEmpty()) return;
|
|
addToDump(queryString, Integer.toString(resultCount), new Date(query.starttime));
|
|
}
|
|
|
|
public static void addToDump(String querystring, String resultcount) {
|
|
addToDump(querystring, resultcount, new Date());
|
|
if (lastLogDump + DUMP_PERIOD < System.currentTimeMillis()) {
|
|
lastLogDump = System.currentTimeMillis();
|
|
dumpLog();
|
|
}
|
|
}
|
|
|
|
public static void addToDump(String querystring, String resultcount, Date d) {
|
|
//if (query.resultcount == 0) return;
|
|
if (querystring == null || querystring.isEmpty()) return;
|
|
final StringBuilder sb = new StringBuilder(40);
|
|
sb.append(GenericFormatter.SHORT_SECOND_FORMATTER.format(d));
|
|
sb.append(' ');
|
|
sb.append(resultcount);
|
|
sb.append(' ');
|
|
sb.append(querystring);
|
|
synchronized (log) {
|
|
log.add(sb.toString());
|
|
}
|
|
}
|
|
|
|
public static void dumpLog() {
|
|
while (!localSearches.isEmpty()) {
|
|
addToDump(localSearches.removeFirst(), 0);
|
|
}
|
|
Thread t = new Thread() {
|
|
@Override
|
|
public void run() {
|
|
ArrayList<String> logCopy = new ArrayList<String>();
|
|
synchronized (log) {
|
|
logCopy.addAll(log);
|
|
log.clear();
|
|
}
|
|
RandomAccessFile raf = null;
|
|
try {
|
|
raf = new RandomAccessFile(dumpFile, "rw");
|
|
raf.seek(raf.length());
|
|
for (final String s: logCopy) {
|
|
raf.write(UTF8.getBytes(s));
|
|
raf.writeByte(10);
|
|
}
|
|
logCopy.clear();
|
|
} catch (final FileNotFoundException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
if (raf != null) try {raf.close();} catch (final IOException e) {}
|
|
}
|
|
}
|
|
};
|
|
t.start();
|
|
}
|
|
|
|
/**
|
|
* read the log and return a list of lines which are equal or greater than
|
|
* the from-date and smaller than the to-date
|
|
* @param f the dump file
|
|
* @param from the left boundary of the sequence to search for (included)
|
|
* @param to the right boundary of the sequence to search for (excluded)
|
|
* @return a list of lines within the given dates
|
|
*/
|
|
public static List<EventTracker.Event> readLog(File f, Date from, Date to) {
|
|
List<EventTracker.Event> events = new ArrayList<>();
|
|
RandomAccessFile raf = null;
|
|
try {
|
|
raf = new RandomAccessFile(f, "r");
|
|
Date fd = readDate(raf, 0);
|
|
if (fd.after(from)) from = fd;
|
|
long seekFrom = binarySearch(raf, from, 0, raf.length());
|
|
long seekTo = binarySearch(raf, to, seekFrom, raf.length());
|
|
//Date eDate = readDate(raf, seekTo);
|
|
//if (eDate.before(to)) seekTo = raf.length();
|
|
raf.seek(seekFrom);
|
|
byte[] buffer = new byte[(int) (seekTo - seekFrom)];
|
|
raf.readFully(buffer); // we make a copy because that dramatically speeds up reading lines; RandomAccessFile.readLine is very slow
|
|
raf.close();
|
|
ByteArrayInputStream bais = new ByteArrayInputStream(buffer);
|
|
BufferedReader reader = new BufferedReader(new InputStreamReader(bais, "UTF-8"));
|
|
String line;
|
|
while ((line = reader.readLine()) != null) {
|
|
// parse the line
|
|
if (line.length() < GenericFormatter.PATTERN_SHORT_SECOND.length() + 3 ||
|
|
line.charAt(GenericFormatter.PATTERN_SHORT_SECOND.length()) != ' ') continue;
|
|
String dateStr = line.substring(0, GenericFormatter.PATTERN_SHORT_SECOND.length());
|
|
int countEnd = -1;
|
|
for (int i = GenericFormatter.PATTERN_SHORT_SECOND.length() + 2; i < line.length(); i++) {
|
|
if (line.charAt(i) == ' ') { countEnd = i; break; }
|
|
}
|
|
if (countEnd == -1) continue;
|
|
String countStr = line.substring(GenericFormatter.PATTERN_SHORT_SECOND.length() + 1, countEnd);
|
|
if (countStr.length() > 5) continue;
|
|
int hits = countStr.length() == 1 ? (countStr.charAt(0)) - 48 : Integer.parseInt(countStr);
|
|
EventTracker.Event event;
|
|
try {
|
|
event = new EventTracker.Event(dateStr, 0, "query", line.substring(dateStr.length() + countStr.length() + 2), hits);
|
|
events.add(event);
|
|
} catch (NumberFormatException e) {
|
|
continue;
|
|
} catch (Throwable e) {
|
|
continue;
|
|
}
|
|
}
|
|
reader.close();
|
|
bais.close();
|
|
buffer = null;
|
|
} catch (final FileNotFoundException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
if (raf != null) try {raf.close();} catch (final IOException e) {}
|
|
}
|
|
return events;
|
|
}
|
|
|
|
/**
|
|
* recursively search for a the smallest date which is equal or greater than the given date
|
|
* @param raf the random access file
|
|
* @param date the given date
|
|
* @param l first seek position to look (included, we expect a date there or after the position l)
|
|
* @param r last seek position to look (excluded, we do not expect that there is a date)
|
|
* @return the first position where a date appears that is equal or greater than the given one
|
|
*/
|
|
private static long binarySearch(RandomAccessFile raf, Date date, long l, long r) throws IOException {
|
|
if (r <= l) return l;
|
|
long m = seekLB(raf, (l + r) / 2);
|
|
if (m <= l) return m;
|
|
Date mDate = readDate(raf, m);
|
|
if (mDate.after(date)) return binarySearch(raf, date, l, m);
|
|
return binarySearch(raf, date, m, r);
|
|
}
|
|
|
|
/**
|
|
* find the beginning of a line
|
|
* @param raf the random access file
|
|
* @param x any seek position in the file
|
|
* @return the seek position of the beginning of a line smaller or equal to x
|
|
* @throws IOException
|
|
*/
|
|
private static long seekLB(RandomAccessFile raf, long x) throws IOException {
|
|
if (x <= 0) return x;
|
|
raf.seek(x);
|
|
while (x > 0 && raf.read() >= 32) {x--; raf.seek(x);}
|
|
if (x == 0) return 0;
|
|
raf.seek(x);
|
|
return raf.read() >= 32 ? x : x + 1;
|
|
}
|
|
|
|
/**
|
|
* read a date at the seek position; the seek position must be exactly at the date start
|
|
* @param raf the random access file
|
|
* @param x the seek position of the date string start position
|
|
* @return the date at position x
|
|
* @throws IOException
|
|
*/
|
|
private static Date readDate(RandomAccessFile raf, long x) throws IOException {
|
|
raf.seek(x);
|
|
byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
|
|
raf.readFully(b);
|
|
try {
|
|
return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
|
|
} catch (ParseException e) {
|
|
throw new IOException(e.getMessage());
|
|
}
|
|
}
|
|
|
|
public static void main(String[] args) {
|
|
// i.e. /Users/admin/git/rc1/DATA/LOG/queries.log 20140522135156 20140614223118
|
|
String file = args[0];
|
|
Date from;
|
|
try {
|
|
from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
|
|
Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
|
|
List<EventTracker.Event> dump = readLog(new File(file), from, to);
|
|
for (EventTracker.Event s: dump) System.out.println(s.toString());
|
|
} catch (ParseException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|