- added a new RSS reader interface. It is not finished yet, but you can already load and inspect RSS feeds. It will be used to index RSS feeds in a way that is appropriate for this kind of data.

- refactoring of Mediawiki and PHPBB3 loader interface names (just renamed)
- removed two old, unused RSS loader interfaces
- fixed a bug in RSS parser library of cora
- added a new RSS parser component to the set of yacy document parsers

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7053 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2010-08-20 11:30:02 +00:00
parent 933dc1a600
commit e10cd115a9
14 changed files with 324 additions and 233 deletions

View File

@ -1,39 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Feed Reader</title>
#%env/templates/metas.template%#
<!--
  Feed reader page.
  The "page" switch in the body has three alternatives, in this order:
    0 = usage hint (no feed requested),
    1 = feed view: channel title, optional author, description, then one entry per feed item,
    2 = error view, which contains a nested "error" switch with two messages.
  The "hasAuthor" switch renders nothing for 0 and the author line for 1.
-->
</head>
<body>
#%env/templates/header.template%#
#(page)#
please select your feed with ?url=Feedurl&max=5&offset=1 (to be implemented in html ;))
::
<dl>
<dt>Title</dt>
<dd>#[title]#</dd>
#(hasAuthor)#::<dt>Author</dt>
<dd>#[author]#</dd>#(/hasAuthor)#
<dt>Description</dt>
<dd>#[description]#</dd>
</dl>
<dl>
#{items}#
<dt><a href="#[link]#">#[title]#</a></dt>
<dd style="border: thin solid red">#[description]#</dd>
#{/items}#
</dl>
::
Error:
#(error)#
You need to install libx
::
Problem with url
#(/error)#
test
#(/page)#
#%env/templates/footer.template%#
</body>
</html>

View File

@ -1,91 +0,0 @@
//FeedReader_p.java
//------------
// part of YACY
//
// (C) 2007 Alexander Schier
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
// test url:
// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2
/**
 * Servlet for the feed reader page: reads the RSS feed given by the "url"
 * request parameter and publishes its channel data and items as template
 * properties.
 */
public class FeedReader_p {

    /**
     * Builds the template properties for one request.
     *
     * The "page" property selects the template alternative: "0" is the
     * initial value, "1" is set after a feed was read completely and "2" is
     * set when the given url is malformed. If reading the feed fails with an
     * IOException the exception is logged and "page" keeps the value "0".
     *
     * @param header request header (not used here)
     * @param post   request parameters, may be null
     * @param env    server environment (not used here)
     * @return the properties for the template
     */
    public static servletProperties respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final servletProperties prop = new servletProperties();
        prop.put("page", "0");
        if (post == null) {
            return prop;
        }

        final DigestURI feedUrl;
        try {
            feedUrl = new DigestURI(post.get("url"), null);
        } catch (final MalformedURLException e) {
            prop.put("page", "2");
            return prop;
        }

        // note: the "max" and "offset" parameters mentioned in the template are not evaluated
        try {
            final RSSFeed feed = new RSSReader(feedUrl.toString()).getFeed();
            writeChannel(prop, feed);
            writeItems(prop, feed);
            prop.put("page_items", feed.size());
            prop.put("page", "1");
        } catch (final IOException e) {
            Log.logException(e);
        }

        // return rewrite properties
        return prop;
    }

    /** Publishes title, optional author and description of the feed channel. */
    private static void writeChannel(final servletProperties prop, final RSSFeed feed) {
        prop.putHTML("page_title", feed.getChannel().getTitle());
        if (feed.getChannel().getAuthor() != null) {
            prop.put("page_hasAuthor", "1");
            prop.putHTML("page_hasAuthor_author", feed.getChannel().getAuthor());
        } else {
            prop.put("page_hasAuthor", "0");
        }
        prop.putHTML("page_description", feed.getChannel().getDescription());
    }

    /** Publishes every feed item under consecutively numbered property keys. */
    private static void writeItems(final servletProperties prop, final RSSFeed feed) {
        int position = 0;
        for (final Hit entry : feed) {
            final String key = "page_items_" + position;
            prop.putHTML(key + "_author", entry.getAuthor());
            prop.putHTML(key + "_title", entry.getTitle());
            prop.putHTML(key + "_link", entry.getLink());
            prop.putHTML(key + "_description", entry.getDescription());
            prop.putHTML(key + "_date", DateFormatter.formatShortSecond(entry.getPubDate()));
            position++;
        }
    }
}

View File

@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class ConfigWikiSearch {
public class Load_MediawikiWiki {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements

View File

@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class ConfigPHPBB3Search {
public class Load_PHPBB3 {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements

90
htroot/Load_RSS_p.html Normal file
View File

@ -0,0 +1,90 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Loading of RSS Feeds</title>
#%env/templates/metas.template%#
<!--
  RSS loading page.
  The first form submits the feed url back to this page.
  The "showitems" switch renders the channel data and the item table once a feed was loaded;
  the "showload" switch toggles the text shown under "Indexing Mode".
-->
<script type="text/javascript">
<!--
// Toggles all item checkboxes of the form with the given name.
// If check is truthy and at least one item is already checked, all items are unchecked;
// otherwise every item checkbox is set to check.
// The number of item checkboxes is read from the hidden "num" field of the form.
function setall(name, check){
  var i;
  var selectForm = document.forms.namedItem(name);
  var count = selectForm.elements["num"].value;
  if (check) for(i = 0; i < count; i++) {
    if (selectForm.elements["item_" + i].checked) {
      check = false;
      break;
    }
  }
  for(i = 0; i < count; i++){
    selectForm.elements["item_" + i].checked = check;
  }
}
-->
</script>
<script type="text/javascript" src="/js/sorttable.js"></script>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Loading of RSS Feeds</h2>
<p>
RSS feeds can be loaded into the YaCy search index.
This does not load the rss file as such into the index but all the messages inside the RSS feeds as individual documents.
</p>
<form action="Load_RSS_p.html" method="get">
<fieldset>
<dl>
<dt><b>URL of the RSS feed</b></dt>
<dd><input type="text" name="url" value="#[url]#" size="60" maxlength="256"/></dd>
<dt>Simulation Mode</dt>
<dd><input type="submit" name="showrss" value="Show RSS Items" /></dd>
<dt>Indexing Mode</dt>
<dd>#(showload)#Available after successful loading of rss feed in simulation mode::
<!--<input type="submit" name="loadrss" value="Index RSS Items" />-->not yet implemented <b>THIS INTERFACE IS A STUB - DEVELOPMENT IS ONGOING</b>
#(/showload)#</dd>
</dl>
</fieldset>
</form>
#(showitems)#::
<form name="rssfeed"><fieldset>
<legend><label for="table">RSS Feed</label></legend>
<dl>
<dt>Title</dt><dd>#[title]#</dd>
<dt>Author</dt><dd>#[author]#</dd>
<dt>Description</dt><dd>#[description]#</dd>
<dt>Language</dt><dd>#[language]#</dd>
<dt>Date</dt><dd>#[date]#</dd>
<dt>Time-to-live</dt><dd>#[ttl]#</dd>
<dt>Docs</dt><dd>#[docs]#</dd>
</dl>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td>Title</td>
<td>URL</td>
<td>Author</td>
<td>Language</td>
<td>Date</td>
<td>Description</td>
</tr>
#{item}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[hash]#" /></td>
<td><a href="#[link]#">#[title]#</a></td>
<td><a href="#[link]#">#[link]#</a></td>
<td>#[author]#</td>
<td>#[language]#</td>
<td>#[date]#</td>
<td>#[description]#</td>
</tr>
#{/item}#
</table>
<input type="hidden" name="num" value="#[num]#" />
</fieldset></form>
#(/showitems)#
#%env/templates/footer.template%#
</body>
</html>

114
htroot/Load_RSS_p.java Normal file
View File

@ -0,0 +1,114 @@
/**
* RSSLoader_p
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 20.08.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.DateFormat;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet for Load_RSS_p.html: loads the RSS feed given by the "url" request
 * parameter (bypassing the cache) and publishes the channel data and all
 * items with a well-formed link as template properties.
 */
public class Load_RSS_p {

    /**
     * Builds the template properties for one request.
     *
     * "showitems" is set to 1 once a feed was loaded and parsed, "showload"
     * is set to 1 if at least one item could be listed. Malformed urls and
     * load/parse errors are logged and leave both switches at 0.
     *
     * @param header request header (not used here)
     * @param post   request parameters, may be null
     * @param env    server environment; must be the {@link Switchboard}
     * @return the properties for the template
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        prop.put("showitems", 0);
        prop.put("showload", 0);
        prop.put("url", "");
        if (post == null) return prop;

        // the url is echoed into an attribute of the input form and comes
        // straight from the request, therefore it must be HTML-escaped
        prop.putHTML("url", post.get("url", ""));

        DigestURI feedUrl = null;
        try {
            feedUrl = post.containsKey("url") ? new DigestURI(post.get("url", ""), null) : null;
        } catch (final MalformedURLException e) {
            Log.logException(e);
        }
        if (feedUrl == null) return prop;

        // if we have an url then try to load the rss
        RSSReader rss = null;
        try {
            prop.putHTML("url", feedUrl.toNormalform(true, false));
            final Response entry = sb.loader.load(sb.loader.request(feedUrl, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
            final byte[] resource = entry == null ? null : entry.getContent();
            rss = resource == null ? null : RSSReader.parse(resource);
        } catch (final IOException e) {
            Log.logException(e);
        }
        if (rss == null) return prop;

        final RSSFeed feed = rss.getFeed();
        // one formatter per request; DateFormat instances must not be shared between threads
        final DateFormat dateFormat = DateFormat.getDateTimeInstance();

        prop.put("showitems", 1);
        putChannel(prop, feed.getChannel(), dateFormat);
        final int listed = putItems(prop, feed, dateFormat);
        prop.put("showitems_item", listed);
        prop.put("showitems_num", listed);
        if (listed > 0) prop.put("showload", 1);
        return prop;
    }

    /**
     * Publishes the channel (feed header) data. The author falls back to the
     * copyright entry if no author is given.
     */
    private static void putChannel(final serverObjects prop, final RSSMessage channel, final DateFormat dateFormat) {
        if (channel == null) {
            // NOTE(review): presumably possible for a feed without any item,
            // because the reader assigns the channel when the first item starts - confirm
            prop.putHTML("showitems_title", "");
            prop.putHTML("showitems_author", "");
            prop.putHTML("showitems_description", "");
            prop.putHTML("showitems_language", "");
            prop.putHTML("showitems_date", "");
            prop.putHTML("showitems_ttl", "");
            prop.putHTML("showitems_docs", "");
            return;
        }
        String author = channel.getAuthor();
        if (isEmpty(author)) author = channel.getCopyright();
        prop.putHTML("showitems_title", channel.getTitle());
        prop.putHTML("showitems_author", author == null ? "" : author);
        prop.putHTML("showitems_description", channel.getDescription());
        prop.putHTML("showitems_language", channel.getLanguage());
        prop.putHTML("showitems_date", formatDate(dateFormat, channel.getPubDate()));
        prop.putHTML("showitems_ttl", channel.getTTL());
        prop.putHTML("showitems_docs", channel.getDocs());
    }

    /**
     * Publishes all items of the feed that carry a well-formed link; items
     * with a malformed link are logged and skipped.
     *
     * @return the number of items that were published
     */
    private static int putItems(final serverObjects prop, final RSSFeed feed, final DateFormat dateFormat) {
        int count = 0;
        for (final Hit item : feed) {
            final DigestURI itemUrl;
            try {
                itemUrl = new DigestURI(item.getLink(), null);
            } catch (final MalformedURLException e) {
                Log.logException(e);
                continue;
            }
            // same fallback rule as for the channel: empty author -> copyright
            String author = item.getAuthor();
            if (isEmpty(author)) author = item.getCopyright();

            final String key = "showitems_item_" + count;
            prop.put(key + "_count", count);
            prop.putHTML(key + "_hash", new String(itemUrl.hash()));
            prop.putHTML(key + "_author", author == null ? "" : author);
            prop.putHTML(key + "_title", item.getTitle());
            prop.putHTML(key + "_link", itemUrl.toNormalform(false, false));
            prop.putHTML(key + "_description", item.getDescription());
            prop.putHTML(key + "_language", item.getLanguage());
            prop.putHTML(key + "_date", formatDate(dateFormat, item.getPubDate()));
            count++;
        }
        return count;
    }

    /** Formats a date for display; a missing date is rendered as empty string. */
    private static String formatDate(final DateFormat dateFormat, final java.util.Date date) {
        return date == null ? "" : dateFormat.format(date);
    }

    /** @return true if the string is null or has no characters */
    private static boolean isEmpty(final String s) {
        return s == null || s.length() == 0;
    }
}

View File

@ -1,97 +0,0 @@
//ViewFile.java
//-----------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//last major change: 12.07.2004
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//you must compile this file with
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Stub servlet that loads the resource given by the "url" request parameter
 * (bypassing the cache) and parses it as RSS. The parsed feed is not
 * evaluated further; the returned properties are always empty.
 */
public class RSSLoader_p {

    /**
     * Loads and parses the RSS resource named by the "url" parameter.
     *
     * @param header request header (not used here)
     * @param post   request parameters, may be null
     * @param env    server environment; must be the {@link Switchboard}
     * @return an empty property set in every case
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        if (post == null) {
            return prop;
        }

        // without an url there is nothing to load; do not hand a null url to the loader
        final String urlString = post.get("url", "");
        if (urlString.length() == 0) {
            return prop;
        }

        final DigestURI url;
        try {
            url = new DigestURI(urlString, null);
        } catch (final MalformedURLException e) {
            Log.logException(e);
            return prop;
        }

        // always load from the web, the cache is not used
        final Response entry;
        try {
            entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
        } catch (final Exception e) {
            // any load failure ends the request, but is no longer swallowed silently
            Log.logException(e);
            return prop;
        }
        if (entry == null) return prop;

        final byte[] resource = entry.getContent();
        if (resource == null) {
            return prop;
        }

        // now parse the content as rss; the result is not used yet
        try {
            RSSReader.parse(resource);
        } catch (final IOException e) {
            Log.logException(e);
            return prop;
        }

        // get the links out of the rss
        // put the urls into crawler using the proxy profile
        // (both steps are not implemented)
        return prop;
    }
}

View File

@ -2,8 +2,9 @@
<h3>Index Creation</h3>
<ul class="SubMenu">
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start (Advanced)</a></li>
<li><a href="/ConfigWikiSearch.html" class="MenuItemLink">Indexing of Media Wikis</a></li>
<li><a href="/ConfigPHPBB3Search.html" class="MenuItemLink">Indexing of phpBB3 Forums</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Indexing of Media Wikis</a></li>
<li><a href="/Load_PHPBB3.html" class="MenuItemLink">Indexing of phpBB3 Forums</a></li>
<li><a href="/Load_RSS_p.html" class="MenuItemLink lock">Indexing of RSS Feeds</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Scraping Proxy Configuration</a></li>
</ul>
</div>

View File

@ -39,7 +39,7 @@ public class RSSMessage implements Hit {
title("title"),
link("link"),
description("description"),
pubDate("pubDate"),
pubDate("pubDate,lastBuildDate"),
copyright("copyright,dc:publisher,publisher"),
author("author,dc:creator,creator"),
subject("subject,dc:subject"),
@ -47,6 +47,7 @@ public class RSSMessage implements Hit {
referrer("referrer,referer"),
language("language"),
guid("guid"),
ttl("ttl"),
docs("docs");
private Set<String> keys;
@ -159,6 +160,10 @@ public class RSSMessage implements Hit {
return Token.guid.valueFrom(this.map);
}
public String getTTL() {
return Token.ttl.valueFrom(this.map);
}
public String getDocs() {
return Token.docs.valueFrom(this.map);
}

View File

@ -120,6 +120,11 @@ public class RSSReader extends DefaultHandler {
item = new RSSMessage();
parsingChannel = true;
} else if ("item".equals(tag)) {
if (parsingChannel) {
// the channel ends with the first item not with the channel close tag
theChannel.setChannel(item);
parsingChannel = false;
}
item = new RSSMessage();
parsingItem = true;
} else if ("image".equals(tag)) {
@ -132,7 +137,6 @@ public class RSSReader extends DefaultHandler {
if (tag == null) return;
if ("channel".equals(tag)) {
parsingChannel = false;
theChannel.setChannel(item);
} else if ("item".equals(tag)) {
theChannel.addMessage(item);
parsingItem = false;

View File

@ -45,6 +45,7 @@ import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.swfParser;
@ -81,6 +82,7 @@ public final class TextParser {
initParser(new pdfParser());
initParser(new pptParser());
initParser(new psParser());
initParser(new rssParser());
initParser(new rtfParser());
initParser(new sevenzipParser());
initParser(new swfParser());

View File

@ -0,0 +1,102 @@
/**
* rssParser.java
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 20.08.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.Hit;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
/**
 * Parser that reads an RSS feed and produces one {@link Document} per feed
 * item that has a well-formed link.
 */
public class rssParser extends AbstractParser implements Parser {

    /**
     * Registers the file extensions and mime types this parser is offered for.
     */
    public rssParser() {
        super("RSS Parser");
        SUPPORTED_EXTENSIONS.add("rss");
        SUPPORTED_EXTENSIONS.add("xml");
        // NOTE(review): "XML" is registered literally as a mime type - confirm that this is intended
        SUPPORTED_MIME_TYPES.add("XML");
        SUPPORTED_MIME_TYPES.add("text/rss");
        SUPPORTED_MIME_TYPES.add("application/rss+xml");
        SUPPORTED_MIME_TYPES.add("application/atom+xml");
    }

    /**
     * Reads the feed from the given stream and converts its items.
     *
     * @param url      location of the feed; also used to derive the mime type of the produced documents
     * @param mimeType mime type of the source (not evaluated here)
     * @param charset  charset that is passed on to every produced document
     * @param source   stream with the feed content
     * @return one document for each item with a well-formed link; items with a malformed link are skipped
     * @throws Failure if the feed cannot be read from the stream
     */
    public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
        final RSSReader reader;
        try {
            reader = new RSSReader(source);
        } catch (IOException e) {
            throw new Parser.Failure("Load error:" + e.getMessage(), url);
        }

        final RSSFeed feed = reader.getFeed();
        final List<Document> result = new ArrayList<Document>();
        for (final Hit entry : feed) {
            try {
                result.add(itemToDocument(entry, url, charset));
            } catch (MalformedURLException e) {
                // an item without a usable link cannot become a document
                continue;
            }
        }
        return result.toArray(new Document[result.size()]);
    }

    /**
     * Creates the document for a single feed item. The item link becomes the
     * document location and is also stored as the only anchor, labelled with
     * the item title.
     *
     * @throws MalformedURLException if the item link is not a valid url
     */
    private static Document itemToDocument(final Hit entry, final MultiProtocolURI url, final String charset) throws MalformedURLException {
        final MultiProtocolURI location = new MultiProtocolURI(entry.getLink());

        final Set<String> languageSet = new HashSet<String>();
        languageSet.add(entry.getLanguage());

        final Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
        links.put(location, entry.getTitle());

        return new Document(
                location,
                TextParser.mimeOf(url),
                charset,
                languageSet,
                entry.getSubject(),
                entry.getTitle(),
                entry.getAuthor(),
                entry.getCopyright(),
                new String[0],
                entry.getDescription(),
                null,
                links,
                new HashMap<MultiProtocolURI, ImageEntry>(),
                false);
    }
}