yacy_search_server/source/net/yacy/search/query/AccessTracker.java

347 lines
14 KiB
Java

/**
* AccessTracker
* tracks local and remote search queries and dumps them to a query log file
* Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 29.08.2009 at http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.query;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.EventTracker;
/**
 * Keeps the most recent local and remote searches in memory and persists
 * them to a line-oriented query log.
 *
 * <p>Each log line has the form {@code <date> <resultcount> <querystring>}
 * where the date is written with {@link GenericFormatter#SHORT_SECOND_FORMATTER}.
 * Lines are terminated with a single byte 10.</p>
 *
 * <p>Locking: each search list is guarded by its own monitor, the pending
 * log lines by the monitor of {@code log}. A list monitor may be held while
 * the {@code log} monitor is acquired, never the other way round, and the
 * two list monitors are never held at the same time.</p>
 */
public class AccessTracker {

    /** a dump is started when the last one is older than this many milliseconds */
    private final static long DUMP_PERIOD = 3600000L;

    /** a dump is started when more than this many lines are waiting in memory */
    private final static int DUMP_SIZE = 50000;

    /** lists with at most this many entries are not checked for outdated entries */
    private static final int minSize = 100;

    /** the oldest entries are moved to the log when a list grows beyond this size */
    private static final int maxSize = 1000;

    /** entries older than this many milliseconds are moved to the log (for lists larger than minSize) */
    private static final int maxAge = 24 * 60 * 60 * 1000;

    /**
     * Immutable value object holding the attributes of one query.
     * The meaning and units of the numeric fields are defined by the code
     * that creates the events; nothing in this class interprets them.
     */
    public static class QueryEvent {

        final public String address;
        final public String userAgent;
        final public String query;
        final public Date date;
        final public short offset;
        final public short requestedResults;
        final public short returnedResults;
        final public short knownResults;
        final public short executionTime;

        public QueryEvent(
                final String address, final String userAgent,
                final String query, final Date date,
                final short offset, final short requestedResults,
                final short returnedResults, final short knownResults,
                final short executionTime) {
            this.address = address;
            this.userAgent = userAgent;
            this.query = query;
            this.date = date;
            this.offset = offset;
            this.requestedResults = requestedResults;
            this.returnedResults = returnedResults;
            this.knownResults = knownResults;
            this.executionTime = executionTime;
        }
    }

    /** selects one of the two search lists */
    public enum Location {local, remote}

    private static final LinkedList<QueryParams> localSearches = new LinkedList<QueryParams>();
    private static final LinkedList<QueryParams> remoteSearches = new LinkedList<QueryParams>();

    /** formatted log lines which are not yet written to the dump file */
    private static final ArrayList<String> log = new ArrayList<String>();

    /** serializes the writer threads so that two dumps never append at the same time */
    private static final Object dumpWriteLock = new Object();

    private static volatile long lastLogDump = System.currentTimeMillis();

    /** number of local searches that were drained by dumpLog; guarded by localSearches */
    private static long localCount = 0;

    /** number of remote searches that were drained by dumpLog; guarded by remoteSearches */
    private static long remoteCount = 0;

    private static volatile File dumpFile = null;

    public static void setDumpFile(File f) {
        dumpFile = f;
    }

    public static File getDumpFile() {
        return dumpFile;
    }

    /**
     * Register a search.
     * @param location the list the search belongs to; any other value (including null) is ignored
     * @param query the search to remember
     * @param resultCount the number of results of that search
     */
    public static void add(final Location location, final QueryParams query, int resultCount) {
        boolean logged = false;
        if (location == Location.local) {
            synchronized (localSearches) {logged = add(localSearches, query, resultCount);}
        }
        if (location == Location.remote) {
            synchronized (remoteSearches) {logged = add(remoteSearches, query, resultCount);}
        }
        // the dump must be triggered after the list monitor was released:
        // dumpLog() locks both lists one after the other, doing that while
        // one of them is still held could dead-lock with a concurrent add
        if (logged) dumpLogIfDue();
    }

    /**
     * Append a search to the given list and move entries that do not fit any more to the log.
     * The caller must hold the monitor of the list.
     * @return true if at least one line was appended to the log
     */
    private static boolean add(final LinkedList<QueryParams> list, final QueryParams query, int resultCount) {
        // learn that this word can be a word completion for the DidYouMeanLibrary
        final String queryString = query.getQueryGoal().getQueryString(false);
        if (resultCount > 10 && queryString != null && queryString.length() > 0) {
            // the builder must contain the query exactly once
            WordCache.learn(new StringBuilder(queryString));
        }

        // add query to statistics list
        list.add(query);

        // NOTE(review): evicted entries are logged with the result count of the
        // query that is added right now, not with their own one - the own count
        // is not available here. Evicted entries are also not added to
        // localCount/remoteCount; confirm whether size() should include them.
        boolean logged = false;

        // shrink dump list but keep essentials in dump
        while (list.size() > maxSize || (!list.isEmpty() && MemoryControl.shortStatus())) {
            logged |= appendToLog(list.removeFirst(), resultCount);
        }

        // if the list is small we can terminate
        if (list.size() <= minSize) return logged;

        // if the list is large we look for too old entries
        final long timeout = System.currentTimeMillis() - maxAge;
        while (!list.isEmpty() && list.getFirst().starttime <= timeout) {
            logged |= appendToLog(list.removeFirst(), resultCount);
        }
        return logged;
    }

    /**
     * Get the searches of one list, newest first.
     * The iterator works on a copy that is taken while the list is locked,
     * therefore it is not disturbed by concurrent calls of add or dumpLog.
     * @param location the list to read
     * @return an iterator over a snapshot of the list in reverse order or null for an unknown location
     */
    public static Iterator<QueryParams> get(final Location location) {
        if (location == Location.local) return snapshot(localSearches).descendingIterator();
        if (location == Location.remote) return snapshot(remoteSearches).descendingIterator();
        return null;
    }

    /** copy a search list while holding its monitor */
    private static LinkedList<QueryParams> snapshot(final LinkedList<QueryParams> list) {
        synchronized (list) {
            return new LinkedList<QueryParams>(list);
        }
    }

    /**
     * @param location the list to count
     * @return the number of searches drained by dumpLog so far plus the current size of the list; 0 for an unknown location
     */
    public static long size(final Location location) {
        if (location == Location.local) synchronized (localSearches) {return localCount + localSearches.size();}
        if (location == Location.remote) synchronized (remoteSearches) {return remoteCount + remoteSearches.size();}
        return 0;
    }

    /**
     * Append a log line with the current time as date, see {@link #addToDump(String, String, Date)}
     */
    public static void addToDump(String querystring, String resultcount) {
        addToDump(querystring, resultcount, new Date());
    }

    /**
     * Append a line to the in-memory log and start a dump if the log is large
     * or the last dump is old. Null or empty query strings are ignored.
     * @param querystring the query, control characters are replaced by spaces
     * @param resultcount the result count as it shall appear in the log
     * @param d the date of the query
     */
    public static void addToDump(String querystring, String resultcount, Date d) {
        if (appendToLog(querystring, resultcount, d)) dumpLogIfDue();
    }

    /** append a line for the given search; never triggers a dump */
    private static boolean appendToLog(final QueryParams query, final int resultCount) {
        final String queryString = query.getQueryGoal().getQueryString(false);
        return appendToLog(queryString, Integer.toString(resultCount), new Date(query.starttime));
    }

    /**
     * Format a log line and append it to the in-memory log; never triggers a dump.
     * @return true if a line was appended, false if the query string was null or empty
     */
    private static boolean appendToLog(final String querystring, final String resultcount, final Date d) {
        if (querystring == null || querystring.isEmpty()) return false;
        final StringBuilder sb = new StringBuilder(40 + querystring.length());
        sb.append(GenericFormatter.SHORT_SECOND_FORMATTER.format(d));
        sb.append(' ');
        sb.append(resultcount);
        sb.append(' ');
        // every byte below 32 is a line terminator for the reader of the log,
        // a query must not be able to break a line or to forge a new one
        for (int i = 0; i < querystring.length(); i++) {
            final char c = querystring.charAt(i);
            sb.append(c < 32 ? ' ' : c);
        }
        synchronized (log) {
            log.add(sb.toString());
        }
        return true;
    }

    /** start a dump if too many lines are pending or the last dump is too old */
    private static void dumpLogIfDue() {
        final int pending;
        synchronized (log) {
            pending = log.size();
        }
        if (pending > DUMP_SIZE || lastLogDump + DUMP_PERIOD < System.currentTimeMillis()) {
            dumpLog();
        }
    }

    /**
     * Move all searches of both lists to the log (with a result count of 0)
     * and append the complete in-memory log to the dump file. The file is
     * written by a new thread, this method does not wait for it.
     * If no dump file is set the pending lines are discarded.
     */
    public static void dumpLog() {
        lastLogDump = System.currentTimeMillis();

        // drain the lists one after the other; the monitors are never nested
        final List<QueryParams> drained = new ArrayList<QueryParams>();
        synchronized (localSearches) {
            localCount += localSearches.size();
            drained.addAll(localSearches);
            localSearches.clear();
        }
        synchronized (remoteSearches) {
            remoteCount += remoteSearches.size();
            drained.addAll(remoteSearches);
            remoteSearches.clear();
        }
        // appendToLog does not check the dump condition, so this cannot re-enter dumpLog
        for (final QueryParams query: drained) {
            appendToLog(query, 0);
        }

        final File target = dumpFile;
        final Thread writer = new Thread("AccessTracker.dumpLog") {
            @Override
            public void run() {
                synchronized (dumpWriteLock) {
                    final List<String> batch;
                    synchronized (log) {
                        batch = new ArrayList<String>(log);
                        log.clear();
                    }
                    if (target == null) {
                        // the lines are dropped, keeping them would let the log grow without limit
                        ConcurrentLog.logException(new FileNotFoundException(
                                "AccessTracker: no dump file set, discarding " + batch.size() + " query log lines"));
                        return;
                    }
                    // collect all lines to write them with a single call
                    final StringBuilder sb = new StringBuilder();
                    for (final String line: batch) {
                        sb.append(line).append('\n');
                    }
                    try (RandomAccessFile raf = new RandomAccessFile(target, "rw")) {
                        raf.seek(raf.length());
                        raf.write(UTF8.getBytes(sb.toString()));
                    } catch (final IOException e) {
                        ConcurrentLog.logException(e);
                    }
                }
            }
        };
        writer.start();
    }

    /**
     * read the log and return a list of lines which are equal or greater than
     * the from-date and smaller than the to-date.
     * The lines in the file are expected to be sorted by date. Lines which
     * cannot be parsed are skipped; if the file cannot be read the exception
     * is logged and the events collected so far are returned.
     * @param f the dump file
     * @param from the left boundary of the sequence to search for (included)
     * @param to the right boundary of the sequence to search for (excluded)
     * @return a list of lines within the given dates
     */
    public static List<EventTracker.Event> readLog(File f, Date from, Date to) {
        final List<EventTracker.Event> events = new ArrayList<>();
        if (f == null) return events;
        try {
            // we make a copy because that dramatically speeds up reading lines; RandomAccessFile.readLine is very slow
            final byte[] range = readRange(f, from, to);
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new ByteArrayInputStream(range), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    final EventTracker.Event event = parseLine(line);
                    if (event != null) events.add(event);
                }
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
        return events;
    }

    /**
     * load the bytes of all lines with from <= date < to
     * @throws IOException if the file cannot be read, a date in the file cannot be parsed
     *         or the range does not fit into a byte array
     */
    private static byte[] readRange(final File f, final Date from, final Date to) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile(f, "r")) {
            final long length = raf.length();
            final long seekFrom = binarySearch(raf, from, 0, length);
            final long seekTo = binarySearch(raf, to, seekFrom, length);
            final long span = seekTo - seekFrom;
            if (span > Integer.MAX_VALUE - 8) {
                throw new IOException("query log range of " + span + " bytes is too large to be read into memory");
            }
            final byte[] buffer = new byte[(int) span];
            raf.seek(seekFrom);
            raf.readFully(buffer);
            return buffer;
        }
    }

    /**
     * parse one line of the form "date count query"
     * @return the event or null if the line is malformed
     */
    private static EventTracker.Event parseLine(final String line) {
        final int dateLength = GenericFormatter.PATTERN_SHORT_SECOND.length();
        if (line.length() < dateLength + 3 || line.charAt(dateLength) != ' ') return null;
        final String dateStr = line.substring(0, dateLength);
        // the count has at least one character, the separator is searched behind it
        final int countEnd = line.indexOf(' ', dateLength + 2);
        if (countEnd == -1) return null;
        final String countStr = line.substring(dateLength + 1, countEnd);
        if (countStr.length() > 5) return null;
        try {
            final int hits = Integer.parseInt(countStr);
            return new EventTracker.Event(dateStr, 0, "query", line.substring(countEnd + 1), hits);
        } catch (final Exception e) {
            // best effort: a line with a bad count or a bad date is skipped
            return null;
        }
    }

    /**
     * search for the first line with a date which is equal or greater than the given date.
     * The lines between l and r must be sorted by date.
     * @param raf the random access file
     * @param date the given date
     * @param l first seek position to look (included), must be the start of a line
     * @param r last seek position to look (excluded), must not be greater than the file length
     * @return the start position of the first line in [l, r) with a date that is equal or
     *         greater than the given one, or r if there is no such line
     * @throws IOException if reading fails or a line does not start with a date
     */
    private static long binarySearch(RandomAccessFile raf, Date date, long l, long r) throws IOException {
        long lo = l; // all lines which start before lo are older than date
        long hi = r; // no line which starts at or behind hi is older than date
        while (lo < hi) {
            final long mid = lo + (hi - lo) / 2;
            // lo is a line start, therefore the line which contains mid cannot start before lo
            final long lineStart = Math.max(lo, seekLB(raf, mid));
            if (readDate(raf, lineStart).before(date)) {
                lo = seekNextLine(raf, lineStart, hi); // always greater than the old lo
            } else {
                hi = lineStart; // always smaller than the old hi because lineStart <= mid < hi
            }
        }
        return lo;
    }

    /**
     * find the beginning of a line
     * @param raf the random access file
     * @param x any seek position in the file
     * @return the seek position of the beginning of a line smaller or equal to x;
     *         a line terminator belongs to the line which it ends
     * @throws IOException
     */
    private static long seekLB(RandomAccessFile raf, long x) throws IOException {
        long pos = x;
        // walk back until the byte in front of pos is a terminator or the file start is reached
        while (pos > 0) {
            raf.seek(pos - 1);
            if (raf.read() < 32) break;
            pos--;
        }
        return pos;
    }

    /**
     * find the beginning of the following line
     * @param raf the random access file
     * @param x a seek position inside of a line
     * @param limit the position where the scan stops at the latest, must not be greater than the file length
     * @return the position behind the terminator of the line which contains x, or limit if that is smaller
     * @throws IOException
     */
    private static long seekNextLine(RandomAccessFile raf, long x, long limit) throws IOException {
        raf.seek(x);
        long pos = x;
        while (pos < limit) {
            final int b = raf.read();
            pos++;
            if (b < 32) break; // terminator consumed
        }
        return pos;
    }

    /**
     * read a date at the seek position; the seek position must be exactly at the date start
     * @param raf the random access file
     * @param x the seek position of the date string start position
     * @return the date at position x
     * @throws IOException if the file ends before the date is complete or the bytes are not a date
     */
    private static Date readDate(RandomAccessFile raf, long x) throws IOException {
        raf.seek(x);
        final byte[] b = new byte[GenericFormatter.PATTERN_SHORT_SECOND.length()];
        raf.readFully(b);
        try {
            return GenericFormatter.SHORT_SECOND_FORMATTER.parse(UTF8.String(b), 0).getTime();
        } catch (final ParseException e) {
            // keep the cause to make the bad position traceable
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * print all events of a query log between two dates
     * @param args the log file, the from-date and the to-date,
     *        i.e. /Users/admin/git/rc1/DATA/LOG/queries.log 20140522135156 20140614223118
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            System.err.println("usage: AccessTracker <logfile> <from> <to>, i.e. queries.log 20140522135156 20140614223118");
            return;
        }
        final String file = args[0];
        try {
            final Date from = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[1], 0).getTime();
            final Date to = GenericFormatter.SHORT_SECOND_FORMATTER.parse(args[2], 0).getTime();
            final List<EventTracker.Event> dump = readLog(new File(file), from, to);
            for (final EventTracker.Event s: dump) System.out.println(s.toString());
        } catch (final ParseException e) {
            e.printStackTrace();
        }
    }
}