removed old rss parser (will be replaced with parser from cora package)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7052 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 2010-08-20 07:42:38 +00:00
parent 70dd26ec95
commit 933dc1a600
4 changed files with 8 additions and 210 deletions
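
For orientation, here is the pattern this commit moves to, sketched from nothing more than the calls visible in the hunks below: the cora package's static RSSReader.parse(byte[]) (introduced in the RSSLoader_p.java change) replaces the removed net.yacy.document.parser.rssParser, while getFeed() and the iteration over Hit items are taken from the deleted parser itself. The wrapper class and method names are hypothetical, and the assumption that parse() returns an RSSReader exposing getFeed() is inferred from combining the two files.

import java.io.IOException;

import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;

// Hypothetical illustration class; only the RSSReader/RSSFeed/Hit calls
// are taken from this commit's diff.
public class RssReadSketch {

    public static void listItems(final byte[] resource) {
        RSSReader rss;
        try {
            // new cora entry point, as used in the RSSLoader_p.java hunk
            rss = RSSReader.parse(resource);
        } catch (IOException e) {
            return; // RSSLoader_p logs the exception and aborts here as well
        }
        // getFeed() and the null check mirror the removed rssParser
        final RSSFeed feed = rss.getFeed();
        if (feed == null) return;
        // RSSFeed is iterable over Hit items, as the removed parser's
        // "for (final Hit item: feed)" loop shows
        for (final Hit item : feed) {
            System.out.println(item.getTitle() + " -> " + item.getLink());
        }
    }
}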

htroot/Crawler_p.java

@@ -172,7 +172,7 @@ public class Crawler_p {
         int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
         final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
 
-        if (recrawl.equals("scheduler")) {
+        if (recrawl.equals("scheduler") && repeat_time > 0) {
             // set crawlingIfOlder attributes that are appropriate for scheduled crawling
             crawlingIfOlderCheck = true;
             crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
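
The added repeat_time > 0 guard is the point of this hunk: repeat_time is parsed with a default of "-1", so the unguarded scheduler branch could derive a negative recrawl age. A restatement of the ternary as a standalone helper (hypothetical class and method names, for illustration only) makes that visible:

// Hypothetical helper; the ternary is copied from the hunk above.
public class RecrawlAgeSketch {
    static int ifOlderNumber(final String repeat_unit, final int repeat_time) {
        return repeat_unit.equals("selminutes") ? 1
             : repeat_unit.equals("selhours") ? repeat_time / 2
             : repeat_time * 12; // "seldays" and any other value
    }
    public static void main(String[] args) {
        // With the form default of -1, the unguarded branch would yield -12:
        System.out.println(ifOlderNumber("seldays", -1)); // prints -12
    }
}

The new "&& repeat_time > 0" check keeps that default case out of the scheduler branch entirely.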

htroot/RSSLoader_p.java

@@ -25,13 +25,12 @@
 //javac -classpath .:../Classes Status.java
 //if the shell's current path is HTROOT
 import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.net.MalformedURLException;
-import net.yacy.document.Document;
-import net.yacy.document.Parser;
-import net.yacy.document.parser.rssParser;
+import net.yacy.cora.document.RSSReader;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
@@ -77,14 +76,11 @@ public class RSSLoader_p {
         }
 
         // now parse the content as rss
-        ByteArrayInputStream bais = new ByteArrayInputStream(resource);
-        rssParser parser = new rssParser();
-        Document[] doc;
+        RSSReader rss;
         try {
-            doc = parser.parse(url, "text/rss", "UTF-8", bais);
-        } catch (Parser.Failure e) {
-            return prop;
-        } catch (InterruptedException e) {
+            rss = RSSReader.parse(resource);
+        } catch (IOException e) {
+            Log.logException(e);
             return prop;
         }

source/net/yacy/document/TextParser.java

@@ -45,7 +45,6 @@ import net.yacy.document.parser.ooxmlParser;
 import net.yacy.document.parser.pdfParser;
 import net.yacy.document.parser.pptParser;
 import net.yacy.document.parser.psParser;
-import net.yacy.document.parser.rssParser;
 import net.yacy.document.parser.rtfParser;
 import net.yacy.document.parser.sevenzipParser;
 import net.yacy.document.parser.swfParser;
@@ -82,7 +81,6 @@ public final class TextParser {
         initParser(new pdfParser());
         initParser(new pptParser());
         initParser(new psParser());
-        initParser(new rssParser());
         initParser(new rtfParser());
         initParser(new sevenzipParser());
         initParser(new swfParser());

source/net/yacy/document/parser/rssParser.java

@ -1,196 +0,0 @@
-// rssParser.java
-// ------------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@yacy.net
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2005
-//
-// this file is contributed by Martin Thelian
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package net.yacy.document.parser;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.io.Writer;
-import java.net.MalformedURLException;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Map;
-
-import net.yacy.cora.document.Hit;
-import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.RSSFeed;
-import net.yacy.cora.document.RSSReader;
-import net.yacy.document.AbstractParser;
-import net.yacy.document.Document;
-import net.yacy.document.Parser;
-import net.yacy.document.parser.html.AbstractScraper;
-import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.ImageEntry;
-import net.yacy.document.parser.html.TransformerWriter;
-import net.yacy.kelondro.io.CharBuffer;
-import net.yacy.kelondro.util.ByteBuffer;
-import net.yacy.kelondro.util.FileUtils;
-
-public class rssParser extends AbstractParser implements Parser {
-
-    public rssParser() {
-        super("Rich Site Summary/Atom Feed Parser");
-        SUPPORTED_EXTENSIONS.add("rss");
-        SUPPORTED_EXTENSIONS.add("xml");
-        SUPPORTED_MIME_TYPES.add("XML");
-        SUPPORTED_MIME_TYPES.add("text/rss");
-        SUPPORTED_MIME_TYPES.add("application/rss+xml");
-        SUPPORTED_MIME_TYPES.add("application/atom+xml");
-    }
-
-    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
-        final LinkedList<String> feedSections = new LinkedList<String>();
-        final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
-        final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
-        final ByteBuffer text = new ByteBuffer();
-        final CharBuffer authors = new CharBuffer();
-
-        RSSFeed feed = null;
-        try {
-            feed = new RSSReader(source).getFeed();
-        } catch (IOException e) {
-            throw new Parser.Failure("reading feed failed: " + e.getMessage(), location);
-        }
-        if (feed == null) throw new Parser.Failure("no feed in document", location);
-
-        String feedTitle = "";
-        String feedDescription = "";
-        String feedPublisher = "";
-        String[] feedSubject = {""};
-        if (feed.getChannel() != null) { //throw new Parser.Failure("no channel in document",location);
-            // get the rss feed title and description
-            feedTitle = feed.getChannel().getTitle();
-            // get feed creator
-            final String feedCreator = feed.getChannel().getAuthor();
-            if (feedCreator != null && feedCreator.length() > 0) authors.append(",").append(feedCreator);
-            // get the feed description
-            feedDescription = feed.getChannel().getDescription();
-            // the feed publisher
-            feedPublisher = feed.getChannel().getCopyright();
-            // the feed subject
-            feedSubject = feed.getChannel().getSubject();
-        }
-
-        if (feed.getImage() != null) {
-            try {
-                MultiProtocolURI imgURL = new MultiProtocolURI(feed.getImage());
-                images.put(imgURL, new ImageEntry(imgURL, feedTitle, -1, -1, -1));
-            } catch (MalformedURLException e) {}
-        }
-
-        // loop through the feed items
-        for (final Hit item: feed) {
-            final String itemTitle = item.getTitle();
-            MultiProtocolURI itemURL = null;
-            try {
-                itemURL = new MultiProtocolURI(item.getLink());
-            } catch (MalformedURLException e) {
-                continue;
-            }
-            final String itemDescr = item.getDescription();
-            final String itemCreator = item.getAuthor();
-            if (itemCreator != null && itemCreator.length() > 0) authors.append(",").append(itemCreator);
-
-            feedSections.add(itemTitle);
-            anchors.put(itemURL, itemTitle);
-
-            if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
-            text.append(AbstractScraper.stripAll(itemDescr).trim()).append(' ');
-
-            final String itemContent = item.getDescription();
-            if ((itemContent != null) && (itemContent.length() > 0)) {
-                final ContentScraper scraper = new ContentScraper(itemURL);
-                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                try {
-                    FileUtils.copy(new ByteArrayInputStream(itemContent.getBytes("UTF-8")), writer, Charset.forName("UTF-8"));
-                } catch (UnsupportedEncodingException e) {
-                    continue;
-                } catch (IOException e) {
-                    continue;
-                }
-
-                final String itemHeadline = scraper.getTitle();
-                if (itemHeadline != null && itemHeadline.length() > 0) {
-                    feedSections.add(itemHeadline);
-                }
-
-                final Map<MultiProtocolURI, String> itemLinks = scraper.getAnchors();
-                if (itemLinks != null && !itemLinks.isEmpty()) {
-                    anchors.putAll(itemLinks);
-                }
-
-                final HashMap<MultiProtocolURI, ImageEntry> itemImages = scraper.getImages();
-                if (itemImages != null && !itemImages.isEmpty()) {
-                    ContentScraper.addAllImages(images, itemImages);
-                }
-
-                final byte[] extractedText = scraper.getText();
-                if ((extractedText != null) && (extractedText.length > 0)) {
-                    if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
-                    text.append(scraper.getText());
-                }
-            }
-        }
-
-        final Document[] docs = new Document[]{new Document(
-                location,
-                mimeType,
-                "UTF-8",
-                null,
-                feedSubject,
-                feedTitle,
-                (authors.length() > 0) ? authors.toString(1, authors.length()) : "",
-                feedPublisher,
-                feedSections.toArray(new String[feedSections.size()]),
-                feedDescription,
-                text.getBytes(),
-                anchors,
-                images,
-                false)};
-
-        // close streams
-        try {
-            text.close();
-            authors.close();
-        } catch (IOException e) {
-        }
-
-        return docs;
-    }
-}