repaired rss feed reader

- removed old rss parser
- removed unused rss parser libraries
- added new rss reader
- added previously removed FeedReader_p.java and adapted it to the new rss parser
- adapted parser interface for rss indexing to the new rss parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3970 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
orbiter 2007-07-16 10:07:48 +00:00
parent 26ddf797eb
commit 9da0e53fe8
7 changed files with 308 additions and 185 deletions

View File

@ -16,7 +16,6 @@
<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
<classpathentry kind="lib" path="libx/commons-logging.jar"/>
<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
<classpathentry kind="lib" path="libx/jdom.jar"/>

79
htroot/FeedReader_p.java Normal file
View File

@ -0,0 +1,79 @@
//FeedReader_p.java
//------------
// part of YACY
//
// (C) 2007 Alexander Schier
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
// $LastChangedRevision: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
import de.anomic.xml.rssReader;
// test url:
// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2
/**
 * Servlet backend that reads an RSS feed with {@link rssReader} and exposes
 * the channel data and all items as template properties.
 *
 * The "page" property selects the state: 0 = no request parameters were
 * given, 1 = feed data is available, 2 = the feed could not be shown.
 */
public class FeedReader_p {

    /**
     * Builds the template properties for the feed named by the "url" request parameter.
     *
     * @param header the request header (not used here)
     * @param post   the request parameters; may be null, then only "page" = 0 is set
     * @param env    the server environment (not used here)
     * @return the properties for the template
     */
    public static servletProperties respond(httpHeader header, serverObjects post, serverSwitch env) {
        servletProperties prop = new servletProperties();
        prop.put("page", 0);

        if (post != null) {
            URL url;
            try {
                url = new URL((String) post.get("url"));
            } catch (MalformedURLException e) {
                prop.put("page", 2);
                return prop;
            }

            // int maxitems=Integer.parseInt(post.get("max", "0"));
            // int offset=Integer.parseInt(post.get("offset", "0")); //offset to the first displayed item

            rssReader parser = new rssReader(url.toString());

            // rssReader swallows parse errors, so the channel is null if the feed
            // could not be loaded or contains no <channel> element. Report that
            // with the same state that is used for a malformed url instead of
            // failing with a NullPointerException.
            // NOTE(review): page 2 is presumably the error view of the template - confirm.
            rssReader.Item channel = parser.getChannel();
            if (channel == null) {
                prop.put("page", 2);
                return prop;
            }

            prop.put("page_title", orEmpty(channel.getTitle()));
            String channelAuthor = channel.getAuthor();
            if (channelAuthor == null) {
                prop.put("page_hasAuthor", 0);
            } else {
                prop.put("page_hasAuthor", 1);
                prop.put("page_hasAuthor_author", channelAuthor);
            }
            prop.put("page_description", orEmpty(channel.getDescription()));

            int itemCount = parser.items();
            for (int i = 0; i < itemCount; i++) {
                rssReader.Item item = parser.getItem(i);
                String prefix = "page_items_" + i + "_";
                // the item getters return null for elements that are missing in the feed
                prop.put(prefix + "author", orEmpty(item.getAuthor()));
                prop.put(prefix + "title", orEmpty(item.getTitle()));
                prop.put(prefix + "link", orEmpty(item.getLink()));
                // NOTE(review): the description is written with putASIS, i.e. markup from
                // the (untrusted) feed presumably reaches the page unescaped - confirm
                // that this is intended.
                prop.putASIS(prefix + "description", orEmpty(item.getDescription()));
                prop.put(prefix + "date", orEmpty(item.getPubDate()));
            }
            prop.put("page_items", itemCount);
            prop.put("page", 1);
        }

        // return rewrite properties
        return prop;
    }

    /**
     * Replaces a missing feed value by the empty string, so that no null
     * values are handed to the property object.
     */
    private static String orEmpty(String value) {
        return (value == null) ? "" : value;
    }
}

View File

@ -1,126 +0,0 @@
//rssReader.java
//------------
// part of YACY
//
// (C) 2007 Alexander Schier
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
// $LastChangedRevision: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeSet;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ParseException;
import de.nava.informa.impl.basic.ChannelBuilder;
import de.nava.informa.parsers.FeedParser;
import de.anomic.yacy.yacyCore;
/**
 * Reads a feed with the informa library and keeps its items as {@link Item}
 * objects in a sorted set.
 *
 * Loading is best effort: if the feed cannot be read or parsed, the channel
 * getters return null and the item collection is empty.
 */
public class rssReader {
    URL url;
    ChannelIF channel;
    TreeSet feedItems;

    /**
     * Loads and parses the feed at the given address.
     *
     * @param url the feed address; if yacyCore.seedDB.resolveYacyAddress
     *            returns an address for its host, that address is used instead
     * @throws MalformedURLException if the given or the rewritten address is not a valid url
     */
    public rssReader(String url) throws MalformedURLException {
        this.url = new URL(url);
        String yAddress = yacyCore.seedDB.resolveYacyAddress(this.url.getHost());
        if (yAddress != null) {
            // getFile() is the path plus the query part and already starts with
            // the slash; "/" + getPath() produced a double slash and lost the query
            this.url = new URL(this.url.getProtocol() + "://" + yAddress + this.url.getFile());
        }

        // create the set before parsing, so that getFeedItems() never returns
        // null when the feed cannot be loaded
        feedItems = new TreeSet(new ItemComparator());

        ChannelBuilder builder = new ChannelBuilder();
        try {
            channel = FeedParser.parse(builder, this.url);
            Iterator it = channel.getItems().iterator();
            int count = 0;
            while (it.hasNext()) {
                de.nava.informa.impl.basic.Item item = (de.nava.informa.impl.basic.Item) it.next();
                feedItems.add(new Item(count++, item.getLink(), item.getTitle(), item.getDescription(), item.getDate(), item.getCreator()));
            }
        } catch (IOException e) {
            // best effort: an unreadable feed results in an empty reader
        } catch (ParseException e) {
            // best effort: an unparsable feed results in an empty reader
        }
    }

    /** @return the creator of the channel, or null if no channel was parsed */
    public String getCreator() {
        return (channel != null) ? channel.getCreator() : null;
    }

    /** @return the title of the channel, or null if no channel was parsed */
    public String getTitle() {
        return (channel != null) ? channel.getTitle() : null;
    }

    /** @return the description of the channel, or null if no channel was parsed */
    public String getDescription() {
        return (channel != null) ? channel.getDescription() : null;
    }

    /** @return the parsed items, ordered by {@link ItemComparator}; empty if parsing failed */
    public Collection getFeedItems() {
        return feedItems;
    }

    /**
     * One feed entry. The getters for title, description and creator replace
     * missing values by the empty string, getDate() replaces a missing date
     * by the current time.
     */
    public class Item {
        String creator, title, description;
        Date date;
        URL link;
        int num;

        public Item(int num, URL link, String title, String description, Date date, String creator) {
            this.link = link;
            this.title = title;
            this.description = description;
            this.date = date;
            this.creator = creator;
            this.num = num;
        }

        public URL getLink() {
            return link;
        }

        public String getTitle() {
            return (title != null) ? title : "";
        }

        public String getDescription() {
            return (description != null) ? description : "";
        }

        public Date getDate() {
            return (date != null) ? date : new Date();
        }

        public String getCreator() {
            return (creator != null) ? creator : "";
        }

        /** @return the position that the item got while the feed was read, starting at 0 */
        public int getNum() {
            return num;
        }
    }

    /** Orders items by descending position number. */
    public class ItemComparator implements Comparator {

        public int compare(Object o1, Object o2) {
            int num1 = ((Item) o1).getNum();
            int num2 = ((Item) o2).getNum();
            // explicit comparison instead of a subtraction, which can overflow
            if (num2 < num1) return -1;
            if (num2 > num1) return 1;
            return 0;
        }

        /** @return true if both items have the same position number */
        public boolean equals(Object o1, Object o2) {
            return compare(o1, o2) == 0;
        }
    }
}

View File

@ -48,7 +48,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.Map;
import net.sf.jmimemagic.MagicDetector;
@ -93,26 +92,7 @@ public class rssDetector implements MagicDetector {
}
private String[] detect(InputStream input) {
try {
// getting the format detector class
Class formatDetector = Class.forName("de.nava.informa.utils.FormatDetector");
// getting the proper method
Method getFormat = formatDetector.getMethod("getFormat", new Class[]{InputStream.class});
// invoke the method
Object format = getFormat.invoke(null, new Object[] {input});
if (format == null) return null;
else if (format.toString().startsWith("RSS ")) return new String[]{"application/rss+xml"};
else if (format.toString().startsWith("Atom ")) return new String[]{"application/atom+xml"};
else return null;
} catch (Exception e) {
return null;
} catch (Error e) {
return null;
}
return new String[]{"application/rss+xml"};
}
}

View File

@ -46,10 +46,8 @@ package de.anomic.plasma.parser.rss;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Writer;
import java.util.Collection;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
@ -66,11 +64,8 @@ import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ImageIF;
import de.nava.informa.impl.basic.ChannelBuilder;
import de.nava.informa.impl.basic.Item;
import de.nava.informa.parsers.FeedParser;
import de.anomic.xml.rssReader;
import de.anomic.xml.rssReader.Item;
public class rssParser extends AbstractParser implements Parser {
@ -79,7 +74,7 @@ public class rssParser extends AbstractParser implements Parser {
* @see #getSupportedMimeTypes()
*/
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
static {
static {
SUPPORTED_MIME_TYPES.put("text/rss","xml,rss,rdf");
SUPPORTED_MIME_TYPES.put("application/rdf+xml","xml,rss,rdf");
SUPPORTED_MIME_TYPES.put("application/rss+xml","xml,rss,rdf");
@ -90,11 +85,7 @@ public class rssParser extends AbstractParser implements Parser {
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {
"informa-0.6.0.jar",
"commons-logging.jar",
"jdom.jar"
};
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public rssParser() {
super(LIBX_DEPENDENCIES);
@ -110,44 +101,32 @@ public class rssParser extends AbstractParser implements Parser {
serverByteBuffer text = new serverByteBuffer();
serverCharBuffer authors = new serverCharBuffer();
// creating a channel-builder
ChannelBuilder builder = new ChannelBuilder();
// parsing the rss/atom feed
ChannelIF channel = FeedParser.parse(builder, source);
rssReader reader = new rssReader(source);
// getting the rss feed title and description
String feedTitle = channel.getTitle();
String feedTitle = reader.getChannel().getTitle();
// getting feed creator
String feedCreator = channel.getCreator();
String feedCreator = reader.getChannel().getAuthor();
if (feedCreator != null && feedCreator.length() > 0) authors.append(",").append(feedCreator);
// getting the feed description
String feedDescription = channel.getDescription();
String feedDescription = reader.getChannel().getDescription();
// getting the channel site url
//URL channelSiteURL = channel.getSite();
ImageIF channelImage = channel.getImage();
if (channelImage != null) {
images.add(new htmlFilterImageEntry(new URL(channelImage.getLocation().toExternalForm()), channelImage.getTitle(), -1, -1));
if (reader.getImage() != null) {
images.add(new htmlFilterImageEntry(new URL(reader.getImage()), feedTitle, -1, -1));
}
// loop through the feed items
Collection feedItemCollection = channel.getItems();
if (!feedItemCollection.isEmpty()) {
Iterator feedItemIterator = feedItemCollection.iterator();
while (feedItemIterator.hasNext()) {
for (int i = 0; i < reader.items(); i++) {
// check for interruption
checkInterruption();
// getting the next item
Item item = (Item)feedItemIterator.next();
Item item = reader.getItem(i);
String itemTitle = item.getTitle();
URL itemURL = new URL(item.getLink().toExternalForm());
URL itemURL = new URL(item.getLink());
String itemDescr = item.getDescription();
String itemCreator = item.getCreator();
if (itemCreator != null && itemCreator.length() > 0) authors.append(",").append(itemCreator);
@ -158,7 +137,7 @@ public class rssParser extends AbstractParser implements Parser {
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
String itemContent = item.getElementValue("content");
String itemContent = item.getDescription();
if ((itemContent != null) && (itemContent.length() > 0)) {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL);
@ -187,7 +166,6 @@ public class rssParser extends AbstractParser implements Parser {
}
}
}
}
plasmaParserDocument theDoc = new plasmaParserDocument(

View File

@ -186,7 +186,7 @@ public class SearchService extends AbstractService
/**
* @param url the url
* @param link the url
* @param viewMode one of (VIEW_MODE_AS_PLAIN_TEXT = 1,
* VIEW_MODE_AS_PARSED_TEXT = 2,
* VIEW_MODE_AS_PARSED_SENTENCES = 3) [Source: ViewFile.java]

View File

@ -0,0 +1,213 @@
package de.anomic.xml;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A small SAX based reader for RSS feeds.
 *
 * The feed is parsed completely inside the constructor. Parsing is best
 * effort: errors are printed and swallowed, and the reader then holds what
 * was read up to the error. In particular {@link #getChannel()} returns null
 * if no channel element was seen.
 *
 * Elements are matched by their qualified name as reported by the parser.
 */
public class rssReader extends DefaultHandler {

    // statics for item generation and automatic categorization
    private static int guidcount = 0;
    private static final String[] tagsDef = new String[]{
        "author",      //
        "copyright",   //
        "category",    //
        "title",       //
        "link",        //
        "language",    //
        "description", //
        "creator",     //
        "pubDate",     //
        "guid",        //
        "docs"         //
    };

    private static final HashSet tags = new HashSet();
    static {
        for (int i = 0; i < tagsDef.length; i++) {
            tags.add(tagsDef[i]);
        }
    }

    // parser features that are switched off, because feeds are untrusted input
    // and must not make the parser load external entities or external DTDs
    private static final String[] externalAccessFeatures = new String[]{
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    // class variables
    private Item channel, item;
    private StringBuffer buffer;
    private boolean parsingChannel, parsingImage, parsingItem;
    private String imageURL;
    private ArrayList itemsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
    private HashMap items; // a guid:Item map

    /**
     * Parses the feed at the given location.
     *
     * @param path the location of the feed; it is handed to the SAX parser as uri
     */
    public rssReader(String path) {
        init();
        parse(path);
    }

    /**
     * Parses the feed from the given stream.
     *
     * @param stream the content of the feed
     */
    public rssReader(InputStream stream) {
        init();
        parse(stream);
    }

    private void init() {
        itemsGUID = new ArrayList();
        items = new HashMap();
        buffer = new StringBuffer();
        item = null;
        channel = null;
        parsingChannel = false;
        parsingImage = false;
        parsingItem = false;
    }

    /**
     * Creates a parser with access to external entities and DTDs disabled.
     * A feature that the parser implementation does not know is skipped.
     */
    private static SAXParser newParser() throws Exception {
        SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
        for (int i = 0; i < externalAccessFeatures.length; i++) {
            try {
                saxParser.getXMLReader().setFeature(externalAccessFeatures[i], false);
            } catch (SAXException e) {
                // not recognized or not supported by this parser: keep its default
            }
        }
        return saxParser;
    }

    private void parse(String path) {
        try {
            newParser().parse(path, this);
        } catch (Exception e) {
            // best effort: the reader keeps what was parsed before the error
            e.printStackTrace();
        }
    }

    private void parse(InputStream stream) {
        try {
            newParser().parse(stream, this);
        } catch (Exception e) {
            // best effort: the reader keeps what was parsed before the error
            e.printStackTrace();
        }
    }

    /**
     * Maps a qualified element name to the key under which its value is stored.
     * The parser is not namespace aware, therefore the Dublin Core creator
     * arrives as "dc:creator" and would never match the "creator" key.
     */
    private static String keyOf(String tag) {
        return "dc:creator".equals(tag) ? "creator" : tag;
    }

    public void startElement(String uri, String name, String tag, Attributes atts) throws SAXException {
        // text collected so far belongs to the parent element (usually only
        // whitespace); it must not become part of the value of this element
        buffer.setLength(0);
        if ("channel".equals(tag)) {
            channel = new Item();
            parsingChannel = true;
        } else if ("item".equals(tag)) {
            item = new Item();
            parsingItem = true;
        } else if ("image".equals(tag)) {
            parsingImage = true;
        }
    }

    public void endElement(String uri, String name, String tag) {
        if (tag == null) return;
        if ("channel".equals(tag)) {
            parsingChannel = false;
        } else if ("item".equals(tag)) {
            String guid = item.getGuid();
            // a guid that occurs twice in a feed is listed only once; otherwise
            // the order list and the map differ in size and items() would hide
            // the last items. The map keeps the latest item for a guid.
            if (!items.containsKey(guid)) itemsGUID.add(guid);
            items.put(guid, item);
            parsingItem = false;
        } else if ("image".equals(tag)) {
            parsingImage = false;
        } else if ((parsingImage) && (parsingChannel)) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            if ("url".equals(tag)) imageURL = value;
        } else if (parsingItem) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            String key = keyOf(tag);
            // an empty guid element must not replace the generated guid,
            // because all such items would share the same map key
            boolean emptyGuid = "guid".equals(key) && (value.length() == 0);
            if (tags.contains(key) && !emptyGuid) item.setValue(key, value);
        } else if (parsingChannel) {
            String value = buffer.toString().trim();
            buffer.setLength(0);
            String key = keyOf(tag);
            if (tags.contains(key)) channel.setValue(key, value);
        }
    }

    public void characters(char ch[], int start, int length) {
        if (parsingItem || parsingChannel) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return the channel data, or null if no channel element was parsed
     */
    public Item getChannel() {
        return channel;
    }

    /**
     * @param i the position of the item in the feed, starting at 0
     * @return the item at that position
     */
    public Item getItem(int i) {
        // retrieve item by order number
        return getItem((String) itemsGUID.get(i));
    }

    /**
     * @param guid the guid of an item
     * @return the item with that guid, or null if there is none
     */
    public Item getItem(String guid) {
        // retrieve item by guid
        return (Item) items.get(guid);
    }

    /**
     * @return the number of items that can be retrieved with {@link #getItem(int)}
     */
    public int items() {
        return itemsGUID.size();
    }

    /**
     * @return the content of the url element of the channel image, or null if there was none
     */
    public String getImage() {
        return this.imageURL;
    }

    /**
     * The values of one item or of the channel, stored by element name.
     * Every getter returns null if the element was not present. A new Item
     * starts with a generated guid, which is replaced if the feed provides one.
     */
    public static class Item {

        private HashMap map;

        public Item() {
            this.map = new HashMap();
            this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
        }

        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }
}