- added coordinate storage in solr schema

- fixed shutdown process
- fixed some solr-to-metadata reading
- added a large number of metadata attributes in ViewFile.html
This commit is contained in:
Michael Peter Christen 2012-08-13 10:40:04 +02:00
parent da851c6071
commit b51df6c7e8
14 changed files with 186 additions and 107 deletions

View File

@ -170,7 +170,7 @@
<dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/>
<!-- Type used to index the lat and lon components for the "location" FieldType -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="true" />
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>

View File

@ -34,14 +34,35 @@
<form method="get" action="ViewFile.html" accept-charset="ascii">
<fieldset><legend>URL Metadata</legend>
<dl>
<dt>URL:</dt> <dd><a href="#[url]#">#[url]#</a></dd>
<dt>Hash:</dt> <dd><a href="/api/yacydoc.html?urlhash=#[hash]#">#[hash]#</a></dd>
<dt>In URL-DB:</dt> <dd>#(inurldb)#no::yes#(/inurldb)#</dd>
<dt>In Cache:</dt> <dd>#(incache)#no::yes#(/incache)#</dd>
<dt>Word Count:</dt> <dd>#[wordCount]#</dd>
<dt>URL:</dt><dd><a href="#[url]#">#[url]#</a></dd>
<dt>Hash:</dt><dd><a href="/api/yacydoc.html?urlhash=#[hash]#">#[hash]#</a></dd>
<dt>In Metadata:</dt><dd>#(inurldb)#no::yes#(/inurldb)#</dd>
<dt>In Cache:</dt><dd>#(incache)#no::yes#(/incache)#</dd>
<dt>Word Count:</dt><dd>#[wordCount]#</dd>
<dt>Description:</dt><dd>#[desc]#</dd>
<dt>Size:</dt> <dd>#[size]# Bytes</dd>#(mimeTypeAvailable)#::
<dt>MimeType:</dt> <dd>#[mimeType]#</dd>#(/mimeTypeAvailable)#
<dt>Size:</dt><dd>#[size]# Bytes</dd>#(mimeTypeAvailable)#::
<dt>MimeType:</dt><dd>#[mimeType]#</dd>#(/mimeTypeAvailable)#
<dt>Referrer Hash:</dt><dd>#[referrerHash]#</dd>
<dt>Modified Date:</dt><dd>#[moddate]#</dd>
<dt>Load Date:</dt><dd>#[loaddate]#</dd>
<dt>Fresh Date:</dt><dd>#[freshdate]#</dd>
<dt>Host Hash:</dt><dd>#[hosthash]#</dd>
<dt>dc_creator:</dt><dd>#[dc_creator]#</dd>
<dt>dc_publisher:</dt><dd>#[dc_publisher]#</dd>
<dt>dc_subject:</dt><dd>#[dc_subject]#</dd>
<dt>md5:</dt><dd>#[md5]#</dd>
<dt>lat:</dt><dd>#[lat]#</dd>
<dt>lon:</dt><dd>#[lon]#</dd>
<dt>doctype:</dt><dd>#[doctype]#</dd>
<dt>Language:</dt><dd>#[language]#</dd>
<dt>Flags:</dt><dd>#[flags]#</dd>
<dt>Word Count:</dt><dd>#[wordCount]#</dd>
<dt>Local Links:</dt><dd>#[llocal]#</dd>
<dt>Global Links:</dt><dd>#[lother]#</dd>
<dt>Image Links:</dt><dd>#[limage]#</dd>
<dt>Audio Links:</dt><dd>#[laudio]#</dd>
<dt>Video Links:</dt><dd>#[lvideo]#</dd>
<dt>App Links:</dt><dd>#[lapp]#</dd>
<dt>Triplestore:</dt><dd><pre>#[triples]#</pre></dd>
<dt><label for="viewMode">View as</label>:</dt>
<dd>

View File

@ -112,7 +112,28 @@ public class ViewFile {
boolean pre = false;
// get the url hash from which the content should be loaded
String urlHash = post.get("urlHash", "");
String urlHash = post.get("urlHash", post.get("urlhash", ""));
if (urlHash.length() == 0) {
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
final String urlString = post.get("url", "");
if (urlString.length() > 0) try {
// this call forces the peer to download web pages
// it is therefore protected by the admin password
if (!sb.verifyAuthentication(header)) {
prop.authenticationRequired();
return prop;
}
// define an url by post parameter
url = new DigestURI(MultiProtocolURI.unescape(urlString));
urlHash = ASCII.String(url.hash());
pre = post.getBoolean("pre");
} catch (final MalformedURLException e) {}
}
URIMetadata urlEntry = null;
// get the urlEntry that belongs to the url hash
//boolean ue = urlHash.length() > 0 && indexSegment.exists(ASCII.getBytes(urlHash));
@ -133,25 +154,6 @@ public class ViewFile {
prop.put("error_inurldb", urlEntry == null ? 0 : 1);
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
final String urlString = post.get("url", "");
if (urlString.length() > 0) try {
// this call forces the peer to download web pages
// it is therefore protected by the admin password
if (!sb.verifyAuthentication(header)) {
prop.authenticationRequired();
return prop;
}
// define an url by post parameter
url = new DigestURI(MultiProtocolURI.unescape(urlString));
urlHash = ASCII.String(url.hash());
pre = post.getBoolean("pre");
} catch (final MalformedURLException e) {}
if (url == null) {
prop.put("error", "1");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
@ -346,6 +348,53 @@ public class ViewFile {
prop.put("error_mimeTypeAvailable_mimeType", response.getMimeType());
Model model = JenaTripleStore.getSubmodelBySubject(YaCyMetadata.hashURI(url.hash()));
prop.putXML("error_triples", JenaTripleStore.getRDFByModel(model));
if (urlEntry == null) {
prop.put("error_referrerHash", "");
prop.put("error_moddate", "");
prop.put("error_loaddate", "");
prop.put("error_freshdate", "");
prop.put("error_hosthash", "");
prop.putHTML("error_dc_creator", "");
prop.putHTML("error_dc_publisher", "");
prop.putHTML("error_dc_subject", "");
prop.put("error_md5", "");
prop.put("error_lat", "");
prop.put("error_lon", "");
prop.put("error_doctype", "");
prop.put("error_language", "");
prop.put("error_flags", "");
prop.put("error_wordCount", "");
prop.put("error_llocal", "");
prop.put("error_lother", "");
prop.put("error_limage", "");
prop.put("error_laudio", "");
prop.put("error_lvideo", "");
prop.put("error_lapp", "");
} else {
prop.put("error_referrerHash", urlEntry.referrerHash());
prop.put("error_moddate", urlEntry.moddate());
prop.put("error_loaddate", urlEntry.loaddate());
prop.put("error_freshdate", urlEntry.freshdate());
prop.put("error_hosthash", urlEntry.hosthash());
prop.putHTML("error_dc_creator", urlEntry.dc_creator());
prop.putHTML("error_dc_publisher", urlEntry.dc_publisher());
prop.putHTML("error_dc_subject", urlEntry.dc_subject());
prop.put("error_md5", urlEntry.md5());
prop.put("error_lat", urlEntry.lat());
prop.put("error_lon", urlEntry.lon());
prop.put("error_doctype", Response.doctype2mime(url.getFileExtension(), urlEntry.doctype()));
prop.put("error_language", urlEntry.language());
prop.put("error_flags", urlEntry.flags().toString());
prop.put("error_wordCount", urlEntry.wordCount());
prop.put("error_llocal", urlEntry.llocal());
prop.put("error_lother", urlEntry.lother());
prop.put("error_limage", urlEntry.limage());
prop.put("error_laudio", urlEntry.laudio());
prop.put("error_lvideo", urlEntry.lvideo());
prop.put("error_lapp", urlEntry.lapp());
}
return prop;
}

View File

@ -114,16 +114,14 @@ public final class CrawlStacker {
/**
 * Announces the upcoming close of the crawl stacker: logs how many job
 * entries are still pending and triggers shutdown of both worker queues.
 */
public void announceClose() {
    this.log.logInfo("Flushing remaining " + size() + " crawl stacker job entries.");
    // only the current shutdown() calls are kept; the superseded announceShutdown() calls are dropped
    this.fastQueue.shutdown();
    this.slowQueue.shutdown();
}
public synchronized void close() {
this.log.logInfo("Shutdown. waiting for remaining " + size() + " crawl stacker job entries. please wait.");
this.fastQueue.announceShutdown();
this.slowQueue.announceShutdown();
this.fastQueue.awaitShutdown(2000);
this.slowQueue.awaitShutdown(2000);
this.fastQueue.shutdown();
this.slowQueue.shutdown();
this.log.logInfo("Shutdown. Closing stackCrawl queue.");

View File

@ -172,6 +172,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
* @return the previous value as String.
*/
public String put(final String key, final byte[] value) {
    // a missing byte array is stored as the literal text "NULL"; anything else is decoded via UTF8.String
    final String text = (value == null) ? "NULL" : UTF8.String(value);
    return this.put(key, text);
}

View File

@ -13,12 +13,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -39,9 +39,9 @@ public class ASCII implements Comparator<String> {
public static final ASCII insensitiveASCIIComparator = new ASCII(true);
public static final ASCII identityASCIIComparator = new ASCII(false);
public boolean insensitive;
/**
 * Creates a comparator instance and records the given flag in the
 * public {@code insensitive} field.
 *
 * @param insensitive value to store in {@link #insensitive}
 */
public ASCII(final boolean insensitive) {
    this.insensitive = insensitive;
}
@ -51,6 +51,7 @@ public class ASCII implements Comparator<String> {
return this; // because we do not have any class variables that changes
}
@Override
public int compare(String s0, String s1) {
if (s0 == null && s1 == null) return 0;
if (s0 == null) return -1;
@ -72,7 +73,7 @@ public class ASCII implements Comparator<String> {
if (l1 > l0) return -1;
return 0;
}
public boolean equals(String s0, String s1) {
if (s0 == null && s1 == null) return true;
if (s0 == null) return false;
@ -102,16 +103,16 @@ public class ASCII implements Comparator<String> {
public int hashCode() {
    // hash by object identity; the insensitive flag does not contribute
    final int identityHash = System.identityHashCode(this);
    return identityHash;
}
public final static String String(final byte[] bytes) {
StringBuilder sb = new StringBuilder(bytes.length);
for (int i = 0; i < bytes.length; ++ i) {
if (bytes[i] < 0) throw new IllegalArgumentException();
sb.append((char) bytes[i]);
for (byte b : bytes) {
if (b < 0) throw new IllegalArgumentException();
sb.append((char) b);
}
return sb.toString();
}
public final static String String(final byte[] bytes, final int offset, final int length) {
int l = Math.min(length, bytes.length - offset);
StringBuilder sb = new StringBuilder(l);
@ -121,10 +122,11 @@ public class ASCII implements Comparator<String> {
}
return sb.toString();
}
public final static byte[] getBytes(final String s) {
final byte[] b = new byte[s.length()];
for (int i = 0; i < s.length(); i++) {
int count = s.length();
final byte[] b = new byte[count];
for (int i = 0; i < count; i++) {
b[i] = (byte) s.charAt(i);
}
return b;

View File

@ -23,6 +23,7 @@
package net.yacy.kelondro.data.meta;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
@ -105,12 +106,6 @@ public class URIMetadataNode implements URIMetadata {
return x.intValue();
}
private long getLong(YaCySchema field) {
Long x = (Long) this.doc.getFieldValue(field.name());
if (x == null) return 0;
return x.longValue();
}
private double getDouble(YaCySchema field) {
Double x = (Double) this.doc.getFieldValue(field.name());
if (x == null) return 0.0d;
@ -129,6 +124,13 @@ public class URIMetadataNode implements URIMetadata {
return x;
}
/**
 * Reads the value of the given field from the backing document and returns it as a list.
 *
 * @param field the schema field whose name is looked up in the document
 * @return the field values; an empty list when the field is absent, never null
 */
private ArrayList<Object> getArrayList(YaCySchema field) {
    final Object value = this.doc.getFieldValue(field.name());
    if (value == null) return new ArrayList<Object>(0);
    if (value instanceof ArrayList) {
        // same instance is handed back, as before
        @SuppressWarnings("unchecked")
        final ArrayList<Object> list = (ArrayList<Object>) value;
        return list;
    }
    // NOTE(review): getFieldValue is declared to return Object; presumably it can hand back a
    // single value or another collection type -- normalise instead of failing with a ClassCastException
    if (value instanceof java.util.Collection) {
        return new ArrayList<Object>((java.util.Collection<?>) value);
    }
    final ArrayList<Object> single = new ArrayList<Object>(1);
    single.add(value);
    return single;
}
@Override
public byte[] hash() {
return this.hash;
@ -209,14 +211,16 @@ public class URIMetadataNode implements URIMetadata {
@Override
public char doctype() {
    // use the first stored content type; without one, let Response.docType work from the URL instead
    final ArrayList<Object> contentTypes = getArrayList(YaCySchema.content_type);
    if (contentTypes == null || contentTypes.isEmpty()) return Response.docType(this.url);
    return Response.docType((String) contentTypes.get(0));
}
@Override
public byte[] language() {
    // the language field is read as a list; the first entry wins and "en" is the fallback
    final ArrayList<Object> languages = getArrayList(YaCySchema.language_txt);
    if (languages == null || languages.isEmpty()) return ASCII.getBytes("en");
    return UTF8.getBytes((String) languages.get(0));
}
@Override

View File

@ -80,14 +80,16 @@ public class WorkflowProcessor<J extends WorkflowJob> {
}
/**
 * @return the number of entries currently in the input queue, or 0 when there is no input queue
 */
public int queueSize() {
    return (this.input == null) ? 0 : this.input.size();
}
/**
 * @return true when there is no input queue or the input queue holds no entries
 */
public boolean queueIsEmpty() {
    // null-safe, consistent with queueSize() and queueSizeMax()
    return this.input == null || this.input.isEmpty();
}
/**
 * @return current queue size plus remaining capacity of the input queue, or 0 when there is no input queue
 */
public int queueSizeMax() {
    if (this.input == null) {
        return 0;
    }
    final int used = this.input.size();
    final int free = this.input.remainingCapacity();
    return used + free;
}
@ -174,7 +176,7 @@ public class WorkflowProcessor<J extends WorkflowJob> {
}
@SuppressWarnings("unchecked")
public void announceShutdown() {
public void shutdown() {
if (this.executor == null) {
return;
}
@ -191,14 +193,18 @@ public class WorkflowProcessor<J extends WorkflowJob> {
Log.logInfo("serverProcessor", ".. poison pill is in queue " + this.processName + ", thread " + i + ". awaiting termination");
} catch (final InterruptedException e) { }
}
}
public void awaitShutdown(final long millisTimeout) {
// wait until input queue is empty
while (this.input.size() > 0) {
try {Thread.sleep(100);} catch (InterruptedException e) {}
}
// shut down executors
if (this.executor != null & !this.executor.isShutdown()) {
// wait for shutdown
try {
this.executor.shutdown();
this.executor.awaitTermination(millisTimeout, TimeUnit.MILLISECONDS);
this.executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
} catch (final InterruptedException e) {}
}
Log.logInfo("serverProcessor", "queue " + this.processName + ": shutdown.");

View File

@ -429,7 +429,7 @@ public class Dispatcher {
public void close() {
// removes all entries from the dispatcher and puts them back to a RAMRI
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.shutdown();
if (this.transmissionCloud != null) {
outerLoop: for (final Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (final ReferenceContainer<WordReference> i : e.getValue()) try {
@ -443,7 +443,6 @@ public class Dispatcher {
}
this.transmissionCloud = null;
if (this.indexingTransmissionProcessor != null) {
this.indexingTransmissionProcessor.awaitShutdown(10000);
this.indexingTransmissionProcessor.clear();
}
this.indexingTransmissionProcessor = null;

View File

@ -1559,19 +1559,15 @@ public final class Switchboard extends serverSwitch
net.yacy.gui.framework.Switchboard.shutdown();
this.log.logConfig("SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing");
// closing all still running db importer jobs
this.indexingDocumentProcessor.announceShutdown();
this.indexingDocumentProcessor.awaitShutdown(12000);
this.crawlStacker.announceClose();
this.indexingCondensementProcessor.announceShutdown();
this.indexingAnalysisProcessor.announceShutdown();
this.indexingStorageProcessor.announceShutdown();
this.crawlStacker.close();
this.indexingDocumentProcessor.shutdown();
this.indexingCondensementProcessor.shutdown();
this.indexingAnalysisProcessor.shutdown();
this.indexingStorageProcessor.shutdown();
if ( this.dhtDispatcher != null ) {
this.dhtDispatcher.close();
}
this.indexingCondensementProcessor.awaitShutdown(12000);
this.indexingAnalysisProcessor.awaitShutdown(12000);
this.indexingStorageProcessor.awaitShutdown(12000);
this.crawlStacker.close();
// de.anomic.http.client.Client.closeAllConnections();
this.wikiDB.close();
this.blogDB.close();
@ -1584,8 +1580,7 @@ public final class Switchboard extends serverSwitch
this.webStructure.close();
this.crawlQueues.close();
this.crawler.close();
this.log
.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
this.log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
this.index.close();
this.peers.close();
Cache.close();
@ -1707,6 +1702,7 @@ public final class Switchboard extends serverSwitch
baos.flush();
processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
baos.close();
if (shallTerminate()) break;
}
} catch ( final IOException e ) {
Log.logException(e);
@ -1725,24 +1721,26 @@ public final class Switchboard extends serverSwitch
} catch ( final IOException e ) {
Log.logException(e);
} finally {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try {
final OutputStream os =
new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
os.close();
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
if (!shallTerminate()) {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try {
final OutputStream os =
new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
FileUtils.copy(new BufferedInputStream(new FileInputStream(outfile)), os);
os.close();
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch ( final FileNotFoundException e ) {
Log.logException(e);
} catch ( final IOException e ) {
Log.logException(e);
}
} catch ( final FileNotFoundException e ) {
Log.logException(e);
} catch ( final IOException e ) {
Log.logException(e);
}
}
}
@ -1795,6 +1793,7 @@ public final class Switchboard extends serverSwitch
Log.logException(e);
break;
}
if (shallTerminate()) break;
}
}

View File

@ -209,14 +209,6 @@ public final class MetadataRepository implements Iterable<byte[]> {
private URIMetadata load(final byte[] urlHash, WordReference wre, long weight) {
// get the metadata from the old metadata index
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry != null) return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
// get the metadata from Solr
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
@ -225,6 +217,14 @@ public final class MetadataRepository implements Iterable<byte[]> {
Log.logException(e);
}
// get the metadata from the old metadata index
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry != null) return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
return null;
}

View File

@ -216,8 +216,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// coordinates
if (md.lat() != 0.0f && md.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.lat_coordinate)) addSolr(solrdoc, YaCySchema.lat_coordinate, md.lat());
if (allAttr || contains(YaCySchema.lon_coordinate)) addSolr(solrdoc, YaCySchema.lon_coordinate, md.lon());
if (allAttr || contains(YaCySchema.lat_coordinate)) addSolr(solrdoc, YaCySchema.lat_coordinate, md.lat());
}
if (allAttr || contains(YaCySchema.httpstatus_i)) addSolr(solrdoc, YaCySchema.httpstatus_i, 200);
@ -579,8 +579,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.lat_coordinate)) addSolr(solrdoc, YaCySchema.lat_coordinate, yacydoc.lat());
if (allAttr || contains(YaCySchema.lon_coordinate)) addSolr(solrdoc, YaCySchema.lon_coordinate, yacydoc.lon());
if (allAttr || contains(YaCySchema.lat_coordinate)) addSolr(solrdoc, YaCySchema.lat_coordinate, yacydoc.lat());
}
if (allAttr || contains(YaCySchema.httpstatus_i)) addSolr(solrdoc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode());

View File

@ -72,8 +72,8 @@ public enum YaCySchema implements Schema {
outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
charset_s(SolrType.string, true, true, "character encoding"),
lon_coordinate(SolrType.tdouble, true, false, "longitude of location as declared in WSG84"),
lat_coordinate(SolrType.tdouble, true, false, "latitude of location as declared in WSG84"),
lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),

View File

@ -114,13 +114,13 @@ public class EmbeddedSolrConnector extends AbstractSolrConnector implements Solr
public SolrConfig getConfig() {
return this.defaultCore.getSolrConfig();
}
@Override
public long getSize() {
    // TODO: the inherited getSize() is regarded as a bad hack; this override still
    // delegates to it until a direct size computation replaces the call
    return super.getSize();
}
@Override
public synchronized void close() {
super.close();