- added a function to the OAI-PMH reader that can pull all records from a server by evaluating the resumption token to obtain the URL for retrieving the remaining records

- added monitoring for retrieved records
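
A minimal sketch of the retrieval loop this adds (it mirrors the new OAIPMHImporter.run() shown in the diff below; loader, targetDir and startURL are placeholders for the importer's actual fields):

    // fetch-all sketch: each OAIPMHReader call stores one response fragment in the
    // surrogate directory; its ResumptionToken yields the URL of the next fragment.
    // resumptionURL() throws an IOException once the end of the resumption list is reached.
    DigestURI url = startURL; // e.g. ...?verb=ListRecords&metadataPrefix=oai_dc
    while (url != null) {
        try {
            OAIPMHReader reader = new OAIPMHReader(loader, url, targetDir, "oaipmh");
            url = reader.getResumptionToken().resumptionURL(url);
        } catch (IOException e) {
            break; // end of resumption reached (or a transport error)
        }
    }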

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6444 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2009-11-02 11:53:14 +00:00
parent 350d13e153
commit b0b7a4f9a5
12 changed files with 431 additions and 284 deletions

View File

@@ -16,6 +16,7 @@
<li><a href="/CrawlResults.html?process=4" class="MenuItemLink lock">(4) Proxy Use</a></li>
<li><a href="/CrawlResults.html?process=5" class="MenuItemLink lock">(5) Local Crawling</a></li>
<li><a href="/CrawlResults.html?process=6" class="MenuItemLink">(6) Global Crawling</a></li>
<li><a href="/CrawlResults.html?process=7" class="MenuItemLink lock">(7) Surrogate Import</a></li>
</ul>
</div>
@@ -28,6 +29,7 @@
<p>Case (6) is a monitor of the local receipt-generator, the opposite case of (1). It also contains an indexing result monitor but is not considered private
since it shows crawl requests from other peers.
</p>
<p>Case (7) occurs if surrogate files are imported.</p>
<p><img src="/env/grafics/indexmonitor.png" alt="An illustration how yacy works" /></p>
<p>The image above illustrates the data flow initiated by web index acquisition.
Some processes appear twice to document the complex index migration structure.
@@ -70,6 +72,10 @@
<p>These pages had been indexed by your peer, but the crawl was initiated by a remote peer.
This is the 'mirror'-case of process (1).</p>
<p><em>Use Case:</em> This list may fill up if you check the 'Accept remote crawling requests' flag on the 'Index Create' page.</p>
::
<h2>(7) Results from surrogate import</h2>
<p>These records have been imported from surrogate files in DATA/SURROGATES/in.</p>
<p><em>Use Case:</em> place files with Dublin Core metadata content into DATA/SURROGATES/in or use an index import method (e.g. Wikimedia import, OAI-PMH retrieval).</p>
#(/process)#

View File

@@ -10,12 +10,34 @@
#%env/templates/submenuContentIntegration.template%#
<h2>OAI-PMH Import</h2>
#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set a OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="#[defaulturl]#" size="100" />
<legend>Single request import</legend>
This will submit only a single request, as given here, to an OAI-PMH server and import the records into the index.
<input name="urlstartone" type="text" value="#[defaulturl]#" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
#(import-one)#
::
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>Processed:</dt><dd>#[count]# records</dd>
<dt>ResumptionToken:</dt><dd>#[rt]#</dd>
</dl>
</fieldset></form>
::
Import failed: #[error]#
#(/import-one)#
#(import-all)#
<p>#(status)#::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>Import all Records from a server</legend>
Import all records that follow according to the resumption elements into the index.
<input name="urlstartall" type="text" value="" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
@@ -24,14 +46,13 @@
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>ResumptionToken:</dt><dd>#[rt]#</dd>
<dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
<dt>Speed:</dt><dd>#[speed]# articles per second</dd>
<dt>Processed:</dt><dd>#[count]# records</dd>
<dt>Speed:</dt><dd>#[speed]# records per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
</dl>
</fieldset></form>
#(/import)#
#(/import-all)#
#%env/templates/footer.template%#
</body>

View File

@@ -26,6 +26,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.document.importer.OAIPMHReader;
import net.yacy.document.importer.ResumptionToken;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -40,62 +41,82 @@ public class IndexImportOAIPMH_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("import_defaulturl", "");
prop.put("import-one", 0);
prop.put("import-all", 0);
prop.put("import-all_status", 0);
prop.put("defaulturl", "");
if (OAIPMHImporter.job != null) {
// show result from finished import
try {
ResumptionToken rt = OAIPMHImporter.job.getResumptionToken();
if (rt != null) prop.put("import_defaulturl", rt.resumptionURL(new DigestURI(OAIPMHImporter.job.source(), null)).toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("import_defaulturl", e.getMessage());
} catch (IOException e) {
// reached end of resumption
prop.put("import_defaulturl", e.getMessage());
}
}
if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_count", OAIPMHImporter.job.count());
prop.put("import_speed", OAIPMHImporter.job.speed());
prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
prop.put("import-all", 1);
prop.put("import-all_thread", (OAIPMHImporter.job.isAlive()) ? "running" : "finished");
prop.put("import-all_source", OAIPMHImporter.job.source());
prop.put("import-all_count", OAIPMHImporter.job.count());
prop.put("import-all_speed", OAIPMHImporter.job.speed());
prop.put("import-all_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
prop.put("import-all_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
prop.put("import-all_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
prop.put("import-all_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
return prop;
}
prop.put("import", 0);
if (post == null) {
prop.put("import_status", 0);
return prop;
}
if (post.containsKey("oaipmhurl")) {
String oaipmhurl = post.get("oaipmhurl");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_rt", OAIPMHImporter.job.status());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", e.getMessage());
if (post != null) {
if (post.containsKey("urlstartone")) {
String oaipmhurl = post.get("urlstartone");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHReader r = new OAIPMHReader(sb.loader, url, sb.surrogatesInPath, "oaipmh-one");
ResumptionToken rt = r.getResumptionToken();
prop.put("import-one", 1);
prop.put("import-one_count", (rt == null) ? "not available" : Integer.toString(rt.getRecordCounter()));
prop.put("import-one_source", r.source());
prop.put("import-one_rt", r.getResumptionToken().toString());
// set next default url
try {
DigestURI nexturl = (rt == null) ? null : rt.resumptionURL(url);
if (rt != null) prop.put("defaulturl", (nexturl == null) ? "" : nexturl.toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("defaulturl", e.getMessage());
} catch (IOException e) {
// reached end of resumption
prop.put("defaulturl", e.getMessage());
}
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import-one", 2);
prop.put("import-one_error", e.getMessage());
} catch (IOException e) {
e.printStackTrace();
prop.put("import-one", 2);
prop.put("import-one_error", e.getMessage());
}
}
if (post.containsKey("urlstartall")) {
String oaipmhurl = post.get("urlstartall");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import-all", 1);
prop.put("import-all_thread", "started");
prop.put("import-all_source", OAIPMHImporter.job.source());
prop.put("import-all_count", 0);
prop.put("import-all_speed", 0);
prop.put("import-all_runningHours", 0);
prop.put("import-all_runningMinutes", 0);
prop.put("import-all_remainingHours", 0);
prop.put("import-all_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import-all", 0);
prop.put("import-all_status", 1);
prop.put("import-all_status_message", e.getMessage());
}
}
}
return prop;

View File

@@ -30,7 +30,6 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -72,7 +71,7 @@ public final class CrawlSwitchboard {
final Log log,
final File queuesRoot) {
log.logInfo("Initializing Word Index for the network '" + networkName + "', word hash cache size is " + Word.hashCacheSize + ".");
log.logInfo("Initializing Word Index for the network '" + networkName + "'.");
if (networkName == null || networkName.length() == 0) {
log.logSevere("no network name given - shutting down");

View File

@@ -17,11 +17,12 @@ public enum EventOrigin {
DHT_TRANSFER(3),
PROXY_LOAD(4),
LOCAL_CRAWLING(5),
GLOBAL_CRAWLING(6);
GLOBAL_CRAWLING(6),
SURROGATES(7);
protected int code;
private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING};
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES};
private EventOrigin(int code) {
this.code = code;
}

View File

@@ -93,7 +93,7 @@ public class Segment {
migrateTextIndex(segmentPath, segmentPath);
migrateTextMetadata(segmentPath, segmentPath);
log.logInfo("Initializing Segment '" + segmentPath + "', word hash cache size is " + Word.hashCacheSize + ".");
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;

View File

@@ -56,7 +56,8 @@ public class Segments implements Iterable<Segment> {
PROXY,
LOCALCRAWLING,
REMOTECRAWLING,
PUBLIC; // includes the index that can be retrieved by the yacy p2p api
PUBLIC,
SURROGATES; // includes the index that can be retrieved by the yacy p2p api
public String toString() {
throw new UnsupportedOperationException("toString not allowed");
@@ -97,6 +98,7 @@ public class Segments implements Iterable<Segment> {
this.process_assignment.put(Process.LOCALCRAWLING, "default");
this.process_assignment.put(Process.REMOTECRAWLING, "default");
this.process_assignment.put(Process.PUBLIC, "default");
this.process_assignment.put(Process.SURROGATES, "default");
}
public void setSegment(Process process, String segmentName) {

View File

@@ -1237,7 +1237,7 @@ public final class Switchboard extends serverSwitch {
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.LOCALCRAWLING, response, document, null);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
@@ -1717,9 +1717,10 @@ public final class Switchboard extends serverSwitch {
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI referrerURL = queueEntry.referrerURL();
final EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;
// remove stopwords
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
// STORE URL TO LOADED-URL-DB

View File

@@ -26,21 +26,12 @@
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Switchboard;
@@ -59,22 +50,31 @@ public class OAIPMHImporter extends Thread implements Importer {
private int count;
private long startTime;
private ResumptionToken resumptionToken;
private String message;
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
this.loader = loader;
this.source = source;
this.count = 0;
this.startTime = System.currentTimeMillis();
this.resumptionToken = null;
this.message = "import initialized";
// fix start url
String url = ResumptionToken.truncatedURL(source);
if (!url.endsWith("?")) url = url + "?";
try {
this.source = new DigestURI(url + "verb=ListRecords&metadataPrefix=oai_dc", null);
} catch (MalformedURLException e) {
// this should never happen
e.printStackTrace();
}
}
public int count() {
return this.count;
}
public String status() {
return (this.resumptionToken == null) ? "" : this.resumptionToken.toString();
return this.message;
}
public ResumptionToken getResumptionToken() {
@@ -98,206 +98,20 @@ public class OAIPMHImporter extends Thread implements Importer {
}
public void run() {
Response response;
try {
response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
load(response);
} catch (IOException e) {
e.printStackTrace();
}
}
public void load0(DigestURI source) throws IOException {
Response response = HTTPLoader.load(new Request(source, null));
load(response);
}
private void load(Response response) throws IOException {
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = this.source.getHost() + "_" + System.currentTimeMillis();
File f0 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".tmp");
File f1 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".xml");
FileUtils.copy(b, f0);
f0.renameTo(f1);
/*
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
srt.start();
DCEntry dce;
while ((dce = sr.take()) != DCEntry.poison) {
System.out.println(dce.toString());
}
try {
srt.join();
} catch (InterruptedException e) {}
*/
System.out.println("TOKEN: " + resumptionToken.toString());
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';') {
sbuf.append((char)ch);
this.message = "loading first part of records";
while (true) {
try {
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, "oaipmh");
this.source = reader.getResumptionToken().resumptionURL(this.source);
if (this.source == null) {
this.message = "import terminated with source = null";
break;
}
this.message = "loading next resumption fragment, cursor = " + reader.getResumptionToken().getCursor();
} catch (IOException e) {
this.message = e.getMessage();
break;
}
}
return sbuf;
}
public static String unescape(final String s) {
final int l = s.length();
final StringBuilder sbuf = new StringBuilder(l);
int ch = -1;
int b, sumb = 0;
for (int i = 0, more = -1; i < l; i++) {
/* Get next byte b from URL segment s */
switch (ch = s.charAt(i)) {
case '%':
if (i + 2 < l) {
ch = s.charAt(++i);
int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
ch = s.charAt(++i);
int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
b = (hb << 4) | lb;
} else {
b = ch;
}
break;
case '+':
b = ' ';
break;
default:
b = ch;
}
}
return sbuf.toString();
}
}
/*
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc
special characters in URIs must be encoded, the correct form of the above GET request URL is:
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc
"/","%2F"
"?","%3F"
"#","%23"
"=","%3D"
"&","%26"
":","%3A"
";","%3B"
" ","%20"
"%","%25"
"+","%2B"
GetRecord
http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc
Identify
http://memory.loc.gov/cgi-bin/oai?verb=Identify
ListIdentifiers
http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep
http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo
ListMetadataFormats
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats
ListRecords
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890
http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc
http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc
ListSets
http://an.oa.org/OAI-script?verb=ListSets
http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets
URN identifiers can be resolved via the resolver of the German National Library (d-nb):
http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
<responseDate>2009-10-01T22:20:04Z</responseDate>
<request verb="ListRecords" metadataPrefix="oai_dc">http://opus.bsz-bw.de/fhhv/oai2/oai2.php</request>
<ListRecords>
<record>
<header>
<identifier>oai:opus.bsz-bw.de-fhhv:1</identifier>
<datestamp>2008-03-04T12:17:33Z</datestamp>
<setSpec>ddc:020</setSpec>
<setSpec>pub-type:2</setSpec>
<setSpec>has-source-swb:false</setSpec>
</header>
<metadata>
<oai_dc:dc
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Teaching Information Literacy with the Lerninformationssystem</dc:title>
<dc:creator>Hauschke, Christian</dc:creator>
<dc:creator>Ullmann, Nadine</dc:creator>
<dc:subject>Informationskompetenz</dc:subject>
<dc:subject>E-Learning</dc:subject>
<dc:subject>Bibliothek</dc:subject>
<dc:subject>Informationsvermittlung</dc:subject>
<dc:subject>Wissenschaftliches Arbeiten</dc:subject>
<dc:subject>information literacy</dc:subject>
<dc:subject>e-learning</dc:subject>
<dc:subject>library</dc:subject>
<dc:subject>information dissemination</dc:subject>
<dc:subject>Library and information sciences</dc:subject>
<dc:description>A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated.</dc:description>
<dc:publisher>Fachhochschule Hannover</dc:publisher>
<dc:publisher>Sonstige Einrichtungen. Sonstige Einrichtungen</dc:publisher>
<dc:date>2006</dc:date>
<dc:type>Article</dc:type>
<dc:format>application/pdf</dc:format>
<dc:identifier>urn:nbn:de:bsz:960-opus-10</dc:identifier>
<dc:identifier>http://opus.bsz-bw.de/fhhv/volltexte/2008/1/</dc:identifier>
<dc:source>Australian Academic &amp; Research Libraries, 37 (1), S. 55-60</dc:source>
<dc:language>eng</dc:language>
<dc:rights>http://creativecommons.org/licenses/by/2.0/de/deed.de</dc:rights>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken
expirationDate="2009-10-02T20:20:04Z"
completeListSize="226"
cursor="0">119</resumptionToken>
</ListRecords>
</OAI-PMH>
*/
}

View File

@@ -0,0 +1,256 @@
// OAIPMHReader
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
// get one server with
// http://roar.eprints.org/index.php?action=csv
// list records from oai-pmh like
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
public class OAIPMHReader {
private DigestURI source;
private ResumptionToken resumptionToken;
public OAIPMHReader(LoaderDispatcher loader, DigestURI source, File targetDir, String filePrefix) throws IOException {
this.source = source;
// load the file from the net
Response response;
response = loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = filePrefix + "_" + this.source.getHost() + "_" + DateFormatter.formatShortMilliSecond(new Date());
File f0 = new File(targetDir, file + ".tmp");
File f1 = new File(targetDir, file + ".xml");
// transaction-safe writing
FileUtils.copy(b, f0);
f0.renameTo(f1);
/*
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
srt.start();
DCEntry dce;
while ((dce = sr.take()) != DCEntry.poison) {
System.out.println(dce.toString());
}
try {
srt.join();
} catch (InterruptedException e) {}
*/
}
public ResumptionToken getResumptionToken() {
return this.resumptionToken;
}
public String source() {
return source.toNormalform(true, false);
}
public static StringBuilder escape(final String s) {
final int len = s.length();
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';') {
sbuf.append((char)ch);
}
}
return sbuf;
}
public static String unescape(final String s) {
final int l = s.length();
final StringBuilder sbuf = new StringBuilder(l);
int ch = -1;
int b;
for (int i = 0; i < l; i++) {
/* Get next byte b from URL segment s */
switch (ch = s.charAt(i)) {
case '%':
if (i + 2 < l) {
ch = s.charAt(++i);
int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
ch = s.charAt(++i);
int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF;
b = (hb << 4) | lb;
} else {
b = ch;
}
break;
case '+':
b = ' ';
break;
default:
b = ch;
}
sbuf.append(b);
}
return sbuf.toString();
}
}
/*
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai:arXiv.org:hep-th/9901001&metadataPrefix=oai_dc
special characters in URIs must be encoded, the correct form of the above GET request URL is:
http://an.oa.org/OAI-script?verb=GetRecord&identifier=oai%3AarXiv.org%3Ahep-th%2F9901001&metadataPrefix=oai_dc
"/","%2F"
"?","%3F"
"#","%23"
"=","%3D"
"&","%26"
":","%3A"
";","%3B"
" ","%20"
"%","%25"
"+","%2B"
GetRecord
http://arXiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:cs/0112017&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=GetRecord&identifier=oai:opus.bsz-bw.de-fhhv:6&metadataPrefix=oai_dc
Identify
http://memory.loc.gov/cgi-bin/oai?verb=Identify
ListIdentifiers
http://an.oa.org/OAI-script?verb=ListIdentifiers&from=1998-01-15&metadataPrefix=oldArXiv&set=physics:hep
http://an.oa.org/OAI-script?verb=ListIdentifiers&resumptionToken=xxx45abttyz
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListIdentifiers&metadataPrefix=olac&from=2001-01-01&until=2001-01-01&set=Perseus:collection:PersInfo
ListMetadataFormats
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListMetadataFormats&identifier=oai:perseus.tufts.edu:Perseus:text:1999.02.0119
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats
http://memory.loc.gov/cgi-bin/oai?verb=ListMetadataFormats&identifier=oai:lcoa1.loc.gov:loc.rbc/rbpe.00000111
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListMetadataFormats
ListRecords
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=455
http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&resumptionToken=890
http://an.oa.org/OAI-script?verb=ListRecords&from=1998-01-15&set=physics:hep&metadataPrefix=oai_rfc1807
http://www.perseus.tufts.edu/cgi-bin/pdataprov?verb=ListRecords&from=2002-05-01T14:15:00Z&until=2002-05-01T14:20:00Z&metadataPrefix=oai_dc
http://memory.loc.gov/cgi-bin/oai?verb=ListRecords&from=2002-06-01T02:00:00Z&until=2002-06-01T03:00:00Z&metadataPrefix=oai_marc
ListSets
http://an.oa.org/OAI-script?verb=ListSets
http://purl.org/alcme/etdcat/servlet/OAIHandler?verb=ListSets
URN identifiers can be resolved via the resolver of the German National Library (d-nb):
http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
<?xml version="1.0" encoding="UTF-8"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
<responseDate>2009-10-01T22:20:04Z</responseDate>
<request verb="ListRecords" metadataPrefix="oai_dc">http://opus.bsz-bw.de/fhhv/oai2/oai2.php</request>
<ListRecords>
<record>
<header>
<identifier>oai:opus.bsz-bw.de-fhhv:1</identifier>
<datestamp>2008-03-04T12:17:33Z</datestamp>
<setSpec>ddc:020</setSpec>
<setSpec>pub-type:2</setSpec>
<setSpec>has-source-swb:false</setSpec>
</header>
<metadata>
<oai_dc:dc
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Teaching Information Literacy with the Lerninformationssystem</dc:title>
<dc:creator>Hauschke, Christian</dc:creator>
<dc:creator>Ullmann, Nadine</dc:creator>
<dc:subject>Informationskompetenz</dc:subject>
<dc:subject>E-Learning</dc:subject>
<dc:subject>Bibliothek</dc:subject>
<dc:subject>Informationsvermittlung</dc:subject>
<dc:subject>Wissenschaftliches Arbeiten</dc:subject>
<dc:subject>information literacy</dc:subject>
<dc:subject>e-learning</dc:subject>
<dc:subject>library</dc:subject>
<dc:subject>information dissemination</dc:subject>
<dc:subject>Library and information sciences</dc:subject>
<dc:description>A German university has developed a learning information system to improve information literacy among German students. An online tutorial based on this Lerninformationssystem has been developed. The structure of this learning information system is described, an online tutorial based on it is illustrated, and the different learning styles that it supports are indicated.</dc:description>
<dc:publisher>Fachhochschule Hannover</dc:publisher>
<dc:publisher>Sonstige Einrichtungen. Sonstige Einrichtungen</dc:publisher>
<dc:date>2006</dc:date>
<dc:type>Article</dc:type>
<dc:format>application/pdf</dc:format>
<dc:identifier>urn:nbn:de:bsz:960-opus-10</dc:identifier>
<dc:identifier>http://opus.bsz-bw.de/fhhv/volltexte/2008/1/</dc:identifier>
<dc:source>Australian Academic &amp; Research Libraries, 37 (1), S. 55-60</dc:source>
<dc:language>eng</dc:language>
<dc:rights>http://creativecommons.org/licenses/by/2.0/de/deed.de</dc:rights>
</oai_dc:dc>
</metadata>
</record>
<resumptionToken
expirationDate="2009-10-02T20:20:04Z"
completeListSize="226"
cursor="0">119</resumptionToken>
</ListRecords>
</OAI-PMH>
*/

View File

@@ -55,8 +55,11 @@ public class ResumptionToken extends TreeMap<String, String> {
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
int recordCounter;
public ResumptionToken(final InputStream stream) throws IOException {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
new Reader(stream);
}
@@ -67,6 +70,7 @@ public class ResumptionToken extends TreeMap<String, String> {
String token
) {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
@@ -80,12 +84,33 @@ public class ResumptionToken extends TreeMap<String, String> {
String token
) {
super((Collator) insensitiveCollator.clone());
this.recordCounter = 0;
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", token);
}
/**
* truncate the given url at the '?'
* @param url
* @return a string containing the url up to and including the '?'
*/
public static String truncatedURL(DigestURI url) {
String u = url.toNormalform(true, true);
int i = u.indexOf('?');
if (i > 0) u = u.substring(0, i + 1);
return u;
}
/**
* while parsing the resumption token, all records are counted as well
* @return the result from counting the records
*/
public int getRecordCounter() {
return this.recordCounter;
}
/**
* compute a url that can be used to resume the retrieval from the OAI-PMH resource
* @param givenURL
@@ -97,9 +122,7 @@ public class ResumptionToken extends TreeMap<String, String> {
String token = this.getToken();
if (token == null || token.length() == 0) throw new IOException("end of resumption reached");
String url = givenURL.toNormalform(true, true);
int i = url.indexOf('?');
if (i > 0) url = url.substring(0, i + 1);
String url = truncatedURL(givenURL);
// encoded state
if (token.indexOf("from=") >= 0) {
@@ -225,6 +248,9 @@
*/
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag)) {
recordCounter++;
}
if ("resumptionToken".equals(tag)) {
this.parsingValue = true;
this.atts = atts;

View File

@@ -52,7 +52,7 @@ public class Word {
*/
public static final int commonHashLength = 12;
public static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L)));
private static final int hashCacheSize = Math.max(2048, Math.min(100000, (int) (MemoryControl.available() / 20000L)));
private static final ARC<String, byte[]> hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Runtime.getRuntime().availableProcessors());
// object carries statistics for words and sentences