mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Add url input field as source for WarcImporter
allowing WARC archives to be imported from a URL without prior download.
This commit is contained in:
parent
d3df8a46c4
commit
bec34d3546
|
@ -22,13 +22,16 @@
|
|||
You can download warc archives for example here
|
||||
<a href="https://archive.org/search.php?query=subject%3A%22warcarchives%22&and[]=subject%3A%22warcarchives%22" target="_blank">Internet Archive</a>.
|
||||
</p>
|
||||
<div class="input-group">
|
||||
<span style="display: inline-block">
|
||||
<input name="file" type="file" value="" size="75" /></span>
|
||||
<div class="btn-group">
|
||||
<input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" />
|
||||
</div>
|
||||
</div>
|
||||
<dl>
|
||||
<dt class="TableCellDark"><label for="file">File:</label></dt>
|
||||
<dd><input name="file" id="file" type="file" value="" size="75" /></dd>
|
||||
<dt></dt>
|
||||
<dd>or</dd>
|
||||
<dt class="TableCellDark"><label for="url">Url:</label></dt>
|
||||
<dd><input name="url" id="url" value="" size="75"/></dd>
|
||||
<dt></dt>
|
||||
<dd><input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" /></dd>
|
||||
</dl>
|
||||
</fieldset>
|
||||
</form>
|
||||
|
||||
|
|
|
@ -18,6 +18,10 @@
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.protocol.ClientIdentification;
|
||||
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.document.importer.WarcImporter;
|
||||
|
@ -45,23 +49,42 @@ public class IndexImportWarc_p {
|
|||
} else {
|
||||
prop.put("import", 0);
|
||||
if (post != null) {
|
||||
if (post.containsKey("file")) {
|
||||
String file = post.get("file");
|
||||
final File sourcefile = new File(file);
|
||||
if (sourcefile.exists()) {
|
||||
try {
|
||||
WarcImporter wi = new WarcImporter(sourcefile);
|
||||
wi.start();
|
||||
prop.put("import_thread", "started");
|
||||
} catch (FileNotFoundException ex) {
|
||||
prop.put("import_thread", "Error: file not found [" + file + "]");
|
||||
if (post.containsKey("file") || post.containsKey("url")) {
|
||||
String filename = post.get("file");
|
||||
if (filename != null && filename.length() > 0) {
|
||||
final File sourcefile = new File(filename);
|
||||
if (sourcefile.exists()) {
|
||||
try {
|
||||
WarcImporter wi = new WarcImporter(sourcefile);
|
||||
wi.start();
|
||||
prop.put("import_thread", "started");
|
||||
} catch (FileNotFoundException ex) {
|
||||
prop.put("import_thread", "Error: file not found [" + filename + "]");
|
||||
}
|
||||
prop.put("import", 1);
|
||||
prop.put("import_warcfile", filename);
|
||||
} else {
|
||||
prop.put("import_warcfile", "");
|
||||
prop.put("import_thread", "Error: file not found [" + filename + "]");
|
||||
}
|
||||
prop.put("import_warcfile", file);
|
||||
} else {
|
||||
prop.put("import_warcfile", "");
|
||||
prop.put("import_thread", "Error: file not found [" + file + "]");
|
||||
String urlstr = post.get("url");
|
||||
if (urlstr != null && urlstr.length() > 0) {
|
||||
try {
|
||||
MultiProtocolURL url = new MultiProtocolURL(urlstr);
|
||||
WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr);
|
||||
wi.start();
|
||||
prop.put("import_thread", "started");
|
||||
} catch (MalformedURLException ex) {
|
||||
prop.put("import_thread", ex.getMessage());
|
||||
} catch (IOException ex) {
|
||||
prop.put("import_thread", ex.getMessage());
|
||||
}
|
||||
prop.put("import", 1);
|
||||
prop.put("import_warcfile", urlstr);
|
||||
}
|
||||
}
|
||||
prop.put("import", 1);
|
||||
|
||||
prop.put("import_count", 0);
|
||||
prop.put("import_speed", 0);
|
||||
prop.put("import_runningHours", 0);
|
||||
|
|
|
@ -73,6 +73,18 @@ public class WarcImporter extends Thread implements Importer {
|
|||
sourceSize = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the WarcImporter from an input stream together with an informational
|
||||
* filename or URL, stored as the importer's name; per the original note, the importer
|
||||
* method source() returns this info. Otherwise equivalent to WarcImporter(InputStream).
|
||||
* @param f the input stream to read the WARC archive from
|
||||
* @param urlinfo an informational text such as the URL or the filename
|
||||
*/
|
||||
public WarcImporter (InputStream f, String urlinfo) {
|
||||
// Delegate to the single-argument input stream constructor.
this(f);
|
||||
// Assign the caller-supplied info text to the name field.
name = urlinfo;
|
||||
}
|
||||
|
||||
public WarcImporter(File f) throws FileNotFoundException{
|
||||
name = f.getName();
|
||||
sourceSize = f.length();
|
||||
|
|
Loading…
Reference in New Issue
Block a user