yacy_search_server/htroot/IndexControlURLs_p.html
Michael Peter Christen 0a879c98e7 added new 'firstSeen' database table and necessary data structures which
hold a date for each URL to record when a URL was first seen. This is
then used to overwrite the modification date for URLs upon recrawl in
case that the first-seen date is before the latest document date. This
behaviour is necessary due to the common behaviour of content management
systems which always attach the current date to all documents. Using the
firstSeen database it is possible to approximate a real first document
creation date in case that the crawler starts frequently for the same
domain. As a result the search results ordered by date have a much
better quality and the usage of YaCy as search agent for latest news has
a better quality.
2014-11-13 00:58:58 +01:00

266 lines
15 KiB
HTML

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "DTD/xhtml1-transitional.dtd">
<!-- This page is only XHTML 1.0 Transitional because target is being used in a links -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!-- Optional auto-refresh of this page after 5 seconds. The meta element belongs inside head and must be self-closed to keep the document well-formed XHTML. -->
#(reload)#::<meta http-equiv="REFRESH" content="5; url=/IndexControlURLs_p.html" />#(/reload)#
<title>YaCy '#[clientname]#': URL Database Administration</title>
#%env/templates/metas.template%#
<script type="text/javascript">
//<![CDATA[
// Key-up handler of the URL retrieval form: reads the text currently typed
// into the "urlstring" field of the form with id "searchform" and passes it
// to search() for a live index lookup.
function xmlhttpPost() {
  var typedUrl = document.getElementById('searchform').urlstring.value;
  search(typedUrl);
}
// Looks up index documents whose URL or host related fields match the given
// text and passes the response body to updatepage().
//
// query: text typed by the user; it is matched as a quoted phrase against
//        each of the Solr fields listed below, combined with OR.
//
// The request is asynchronous. Only the response of the most recently issued
// request is rendered, so a slow answer to an earlier key stroke cannot
// overwrite the result of a later one.
function search(query) {
  var request = null;
  if (window.XMLHttpRequest) { // Mozilla/Safari and current browsers
    request = new XMLHttpRequest();
  } else if (window.ActiveXObject) { // legacy IE
    request = new ActiveXObject("Microsoft.XMLHTTP");
  }
  if (request == null) return; // no XHR implementation available

  // Escape backslash and double quote so the text cannot leave the quoted phrase.
  var phrase = "\"" + String(query).replace(/(["\\])/g, "\\$1") + "\"";
  var fields = ["sku", "host_s", "host_dnc_s", "host_organization_s",
                "host_organizationdnc_s", "host_subdomain_s",
                "url_paths_sxt", "url_file_name_s"];
  var clauses = [];
  for (var i = 0; i < fields.length; i++) {
    clauses.push(fields[i] + ":" + phrase);
  }

  // The whole q value is percent-encoded; otherwise characters such as the
  // ampersand, plus or hash sign inside the typed text would corrupt the URL.
  var url = "/solr/select?q=" + encodeURIComponent(clauses.join(" OR ")) + "&start=0&rows=100&wt=yjson";

  search.latest = request; // remember which request is the newest one
  request.open('GET', url, true);
  // No Content-Type header: a GET request carries no body.
  request.onreadystatechange = function() {
    if (request.readyState != 4) return;
    if (request !== search.latest) return; // stale response, a newer request is pending
    if (request.status == 200) {
      updatepage(request.responseText);
    }
  };
  request.send(null);
}
// Renders the response of search() into the element with id "searchresults".
//
// str: response body; it is expected to evaluate to an object of the form
//      { channels: [ { totalResults: "...", items: [ { link: "..." } ] } ] }.
//
// If an element with id "raw" exists, the unparsed response is shown in it
// as text. When the response cannot be parsed, or it contains no results,
// the result area is emptied.
function updatepage(str) {
  // Escapes text for use in HTML content and in double-quoted attributes.
  function escapeHtml(text) {
    return String(text)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");
  }

  var raw = document.getElementById("raw");
  if (raw != null) raw.innerHTML = escapeHtml(str);

  var target = document.getElementById("searchresults");
  var rsp;
  try {
    // NOTE(review): eval() runs the response as script. It is kept because the
    // response may not be strict JSON; if it is confirmed to be, switch to
    // JSON.parse.
    rsp = eval("(" + str + ")");
  } catch (e) {
    target.innerHTML = "";
    return;
  }

  var firstChannel = (rsp && rsp.channels && rsp.channels.length > 0) ? rsp.channels[0] : null;
  var items = (firstChannel && firstChannel.items) ? firstChannel.items : [];
  // Remove ALL thousands separators, not only the first one.
  var totalResults = firstChannel ? String(firstChannel.totalResults).replace(/[,.]/g, "") : "0";

  var html = "";
  if (parseInt(totalResults, 10) > 0 && items.length > 0) {
    html += "<table class=\"networkTable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
    html += "<tr class=\"TableHeader\" valign=\"bottom\">";
    html += "<td>URL from index (total results = " + totalResults + ")<\/td>";
    html += "<\/tr>";
    for (var i = 0; i < items.length; i++) {
      var link = items[i].link;
      // The link is percent-encoded where it is a query parameter value and
      // HTML-escaped where it is displayed, because it stems from crawled data.
      html += "<tr class=\"TableCellLight\"><td align=\"left\"><a href=\"IndexControlURLs_p.html?urlstringsearch=&amp;urlstring=" + encodeURIComponent(link) + "\">" + escapeHtml(link) + "<\/a><\/td><\/tr>";
    }
    html += "<\/table>";
  }
  target.innerHTML = html;
}
//]]>
</script>
</head>
<body id="IndexControl">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<!-- API hint box: links the API icon to an example Solr select call for the current URL hash. Ampersands inside the href are written as entities so the attribute is well-formed XHTML. -->
<div id="api">
<a href="solr/select?defType=edismax&amp;start=0&amp;rows=3&amp;core=collection1&amp;wt=html&amp;q=id:%22#[urlhash]#%22">
<img src="env/grafics/api.png" width="60" height="40" alt="API" /></a>
<span>These document details can be retrieved as <a href="http://www.w3.org/TR/xhtml-rdfa-primer/" target="_blank">XHTML+RDFa</a>
document containg <a href="http://www.w3.org/RDF/" target="_blank">RDF</a> annotations in <a href="http://dublincore.org/" target="_blank">Dublin Core</a> vocabulary.
The XHTML+RDFa data format is both a XML content format and a HTML display format and is considered as an important <a href="http://www.w3.org/2001/sw/" target="_blank">Semantic Web</a> content format.
The same content can also be retrieved as pure <a href="api/yacydoc.xml?urlhash=#[urlhash]#">XML metadata</a> with DC tag name vocabulary.
Click the API icon to see an example call to the search rss API.
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API" target="_blank">API wiki page</a>.</span>
</div>
<h2>URL Database Administration</h2>
<p>The local index currently contains #[ucount]# URL references</p>
<!-- URL retrieval form: posts either a URL or a URL hash back to this page. Every key release inside the form calls xmlhttpPost(), which looks up the text of the urlstring field and lists matching index URLs below that field. -->
<form action="IndexControlURLs_p.html" id="searchform" method="post" enctype="multipart/form-data" accept-charset="UTF-8" onkeyup="xmlhttpPost(); return false;">
<fieldset><legend>URL Retrieval</legend>
<dl>
<dt class="TableCellDark">Retrieve by URL:</dt>
<dd><input type="text" name="urlstring" value="#[urlstring]#" size="40" maxlength="250" />
<input type="submit" name="urlstringsearch" value="Show Details for URL" class="btn btn-primary" style="width:240px;"/><br />
<!-- Container that the updatepage() script function fills with the live lookup result table. -->
<div id="searchresults"></div>
</dd>
<dt class="TableCellDark">Retrieve by URL-Hash:</dt>
<!-- The hash field accepts at most 12 characters. -->
<dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
<input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>
<!-- Cleanup form: selects which local data stores are deleted. The Delete button is enabled while at least one of the primary checkboxes (local index, remote solr, RWI, citation) is ticked. The secondary checkboxes can only be used while the local index checkbox is ticked. -->
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset#(cleanup)# disabled="disabled"::#(/cleanup)#><legend>Cleanup</legend>
<dl>
<dt class="TableCellDark">Index Deletion</dt>
<dd>
<script type="text/javascript">
//<![CDATA[
/* Enables the Delete button if any primary deletion checkbox that exists on the page is ticked, and disables it otherwise. Checkboxes that were not rendered are skipped. */
function updateCleanupSubmit() {
  var ids = ['deleteIndex', 'deleteRemoteSolr', 'deleteRWI', 'deleteCitation'];
  var anyTicked = false;
  for (var i = 0; i < ids.length; i++) {
    var box = document.getElementById(ids[i]);
    if (box != null && box.checked) anyTicked = true;
  }
  document.getElementById('deletecomplete').disabled = !anyTicked;
}
//]]>
</script>
<input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="var x=this.checked;#(rwi)#::document.getElementById('deleteRWI').checked=x;#(/rwi)#document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;document.getElementById('deleteSearchFl').checked=x;document.getElementById('deleteCache').disabled=!x;document.getElementById('deleteFirstSeen').disabled=!x;document.getElementById('deleteRobots').disabled=!x;document.getElementById('deleteCrawlQueues').disabled=!x;document.getElementById('deleteSearchFl').disabled=!x;updateCleanupSubmit();"
/> Delete local search index (embedded Solr and old Metadata)<br/>
#(cleanupsolr)#::<input type="checkbox" name="deleteRemoteSolr" id="deleteRemoteSolr" onclick="updateCleanupSubmit();" /> Delete remote solr index<br/>#(/cleanupsolr)#
#(cleanuprwi)#::<input type="checkbox" name="deleteRWI" id="deleteRWI" onclick="updateCleanupSubmit();" /> Delete RWI Index (DHT transmission words)<br/>#(/cleanuprwi)#
#(cleanupcitation)#::<input type="checkbox" name="deleteCitation" id="deleteCitation" onclick="updateCleanupSubmit();" /> Delete Citation Index (linking between URLs)<br/>#(/cleanupcitation)#
<input type="checkbox" name="deleteFirstSeen" id="deleteFirstSeen" disabled="disabled" /> Delete First-Seen Date Table<br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /> Delete HTTP &amp; FTP Cache<br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /> Stop Crawler and delete Crawl Queues<br/>
<input type="checkbox" name="deleteRobots" id="deleteRobots" disabled="disabled" /> Delete robots.txt Cache<br/>
<input type="checkbox" name="deleteSearchFl" id="deleteSearchFl" disabled="disabled" /> Delete cached snippet-fetching failures during search<br/><br/>
<input type="submit" name="deletecomplete" id="deletecomplete" value="Delete" disabled="disabled" class="btn btn-danger" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>
</dd>
</dl>
</fieldset>
</form>
<!-- Statistics request form: asks for the number of top domains to list and posts it back to this page. NOTE(review): presumably the page servlet decides whether the alternative of the statistics switch containing this form is emitted; confirm against the servlet. -->
#(statistics)#::
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Statistics about top-domains in URL Database</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dd>Show top <input type="text" name="lines" value="#[lines]#" size="6" maxlength="6" /> domains from all URLs.
<input type="submit" name="statistics" value="Generate Statistics" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>
#(/statistics)#
<!-- Statistics result: one table row per domain with its URL count and a per-row form whose button posts a deletedomain request for that domain. -->
#(statisticslines)#::
<p><em>Statistics about the top-#[domains]# domains in the database:</em></p>
<table >
<tr class="TableHeader">
<td align="center"></td>
<td><strong>Domain</strong></td>
<td><strong>URLs</strong></td>
</tr>
<!-- The rows below are repeated by the domains loop; the row class alternates between Light and Dark through the dark switch. -->
#{domains}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<div>
<input type="hidden" name="domain" value="#[domain]#" />
<input type="hidden" name="lines" value="#[lines]#" />
<input type="submit" name="deletedomain" value="delete all" class="btn btn-danger btn-xs" style="padding: 0px 12px;" />
</div>
</form>
</td>
<td><a href="http://#[domain]#/" target="_blank">#[domain]#</a></td>
<td>#[count]#</td>
</tr>
#{/domains}#
</table>
#(/statisticslines)#
<!-- Solr maintenance: three independent forms that post back to this page. The first creates or restores an index dump, the second optimizes the index down to a maximum number of segments, the third restarts the Solr core. -->
#(dumprestore)#::
<!-- Dump and restore: the indexdump button creates a dump, the indexrestore button restores from the file named in the dumpfile field. -->
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Dump and Restore of Solr Index</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dd><input type="submit" name="indexdump" value="Create Dump" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
<dl>
<dt class="TableCellDark">Dump File</dt>
<dd><input type="text" name="dumpfile" value="#[dumpfile]#" size="80" maxlength="250" />
</dd>
<dt class="TableCellDark">&nbsp;</dt>
<dd><input type="submit" name="indexrestore" value="Restore Dump" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>
<!-- Optimize: submits the segment limit entered in the optimizemax field. -->
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Optimize Solr</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dd>merge to max. <input type="text" name="optimizemax" value="#[optimizemax]#" size="6" maxlength="6" /> segments
<input type="submit" name="optimizesolr" value="Optimize Solr" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>
<!-- Reboot: a single button that submits rebootsolr. -->
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Reboot Solr Core</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dd><input type="submit" name="rebootsolr" value="Shut Down and Re-Start Solr" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>::
#(/dumprestore)#
<!-- URL export: the lurlexport switch holds either the export form or a progress message that shows the target file and the number of URLs written so far. The form submits the target file, a URL filter, a query and one of five output formats. -->
#(lurlexport)#::
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Loaded URL Export</legend>
<dl>
<dt class="TableCellDark">Export File</dt>
<dd><input type="text" name="exportfile" value="#[exportfile]#" size="80" maxlength="250" />
</dd>
<dt class="TableCellDark">URL Filter</dt>
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">query</dt>
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<!-- All radio buttons share the name format; dom-html is preselected. -->
<dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
Full URL List:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" />XML (RSS)
<br />
</dd>
<dt class="TableCellDark">&nbsp;</dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
</fieldset>
</form>::
<div class="commit" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# URLs so far</div>::
#(/lurlexport)#
<!-- Status messages: export finished (with a file link to the written file), export failed (with the failure message), and Solr dump stored. NOTE(review): the file link uses target="_", which is not one of the reserved underscore target names; confirm whether _blank was intended. -->
#(lurlexportfinished)#::
<div class="commit">Finished export of #[urlcount]# URLs to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a></div>::
#(/lurlexportfinished)#
#(lurlexporterror)#::
<div class="error">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
#(/lurlexporterror)#
#(indexdump)#::
<div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
#(/indexdump)#
<!-- Document details: the genUrlProfile switch holds nothing, a not-found message, or the detail view. The detail view embeds the Solr HTML output for the URL hash in an iframe, followed by a form that opens the parsed content in ViewFile.html and a form that deletes the URL from the index. Ampersands in the iframe src are written as entities so the attribute is well-formed XHTML, and the iframe carries a title for assistive technology. -->
#(genUrlProfile)#
::No entry found for URL-hash #[urlhash]#
::<iframe src="solr/select?defType=edismax&amp;start=0&amp;rows=3&amp;core=collection1&amp;wt=html&amp;q=id:%22#[urlhash]#%22" title="Solr document details" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />
<form action="ViewFile.html" method="get" accept-charset="UTF-8">
<input type="hidden" name="viewMode" value="parsed" />
<input type="hidden" name="show" value="Show" />
<input type="hidden" name="urlHash" value="#[urlhash]#" />
<input type="submit" value="Show Content" name="showcontent" class="btn btn-primary" style="width:240px;"/><br />
</form>
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" name="keystring" value="" />
<input type="hidden" name="keyhash" value="" />
<input type="hidden" name="urlstring" value="" />
<input type="hidden" name="urlhash" value="#[urlhash]#" />
<input type="submit" value="Delete URL" name="urlhashdelete" class="btn btn-primary" style="width:240px;"/><br />
<span class="small">&nbsp;this may produce unresolved references at other word indexes but they do not harm</span><br /><br />
<input type="submit" value="Delete URL and remove all references from words" name="urlhashdeleteall" class="btn btn-primary" style="width:240px;"/><br />
<span class="small">&nbsp;delete the reference to this url at every other word where the reference exists (very extensive, but prevents unresolved references)</span><br />
</form>
#(/genUrlProfile)#
#[result]#
#%env/templates/footer.template%#
</body>
</html>