*) Migration of optional Content Parser integration

- each additional parser must be in a subpackage 
  of plasma.parser
- each parser must have its own ant build file (which will 
  be called automatically from the main build file)
- Calling the main build file results in building a separate 
  zip file for each optional parser. This zip file includes:
  + sources of the Parser.java
  + compiled classes of the Parser.java
  + needed additional libs (libx)
- To install an additional parser the user simply needs to
  extract the zip file listed above into his/her yacy directory.
- The configuration (enabling/disabling) of a parser can be done
  via the web interface (currently the settings dialog) and takes
  effect "on-the-fly". The installation cannot be done "on-the-fly"
  at the moment because of classpath issues.
- The classpath of the Linux start/stop scripts is now generated
  automatically (including all libraries from lib and libx).

*) Bugfix: File Extension was not calculated correctly by the crawler
   e.g.: file extension was accidentally: .php?param=value
   Corrected.

*) Adding additional parser for parsing of rss/atom feeds
- added needed libs to do this.

TODO:
- build the classpath automatically for the Windows startup scripts as well


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@78 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
theli 2005-05-03 09:47:56 +00:00
parent 1a4ad5a0ac
commit 351c86d5d9
27 changed files with 1285 additions and 525 deletions

View File

@ -3,6 +3,7 @@
YaCy - a Peer to Peer search Engine
</description>
<!-- defining all needed directory names -->
<property name="src" location="source"/>
<property name="lib" location="lib"/>
<property name="libx" location="libx"/>
@ -10,6 +11,10 @@
<property name="htroot" location="htroot"/>
<property name="release" location="RELEASE"/>
<!-- defining some compiler arguments -->
<property name="javacSource" value="1.4"/>
<property name="javacTarget" value="1.4"/>
<target name="init">
<tstamp/>
<mkdir dir="${build}/de/anomic/data"/>
@ -18,6 +23,7 @@
<mkdir dir="${build}/de/anomic/kelondro"/>
<mkdir dir="${build}/de/anomic/net"/>
<mkdir dir="${build}/de/anomic/plasma"/>
<mkdir dir="${build}/de/anomic/plasma/parser"/>
<mkdir dir="${build}/de/anomic/server"/>
<mkdir dir="${build}/de/anomic/tools"/>
<mkdir dir="${build}/de/anomic/yacy"/>
@ -36,7 +42,7 @@
<target name="compile" depends="init" description="compiles the source">
<javac srcdir="${src}/" destdir="${build}" excludes="de/anomic/plasma/parser/**/*">
<javac srcdir="${src}/" destdir="${build}" excludes="de/anomic/plasma/parser/*/*" source="${javacSource}" target="${javacTarget}">
<classpath>
<pathelement location="${build}" />
@ -45,9 +51,9 @@
<pathelement location="${lib}/commons-pool-1.2.jar" />
</classpath>
</javac>
<javac srcdir="${htroot}/" destdir="${htroot}" classpath="${build}"/>
<javac srcdir="${htroot}/htdocsdefault" destdir="${htroot}/htdocsdefault" classpath="${build}"/>
<javac srcdir="${htroot}/yacy" destdir="${htroot}/yacy" classpath="${build}"/>
<!-- The htroot servlet classes are compiled in place (destdir equals srcdir) against the
     core classes in ${build}. The language level comes from the javacSource/javacTarget
     properties so that it is configured in exactly one place for the whole build. -->
<javac srcdir="${htroot}/" destdir="${htroot}" classpath="${build}" source="${javacSource}" target="${javacTarget}"/>
<javac srcdir="${htroot}/htdocsdefault" destdir="${htroot}/htdocsdefault" classpath="${build}" source="${javacSource}" target="${javacTarget}"/>
<javac srcdir="${htroot}/yacy" destdir="${htroot}/yacy" classpath="${build}" source="${javacSource}" target="${javacTarget}"/>
</target>
@ -56,41 +62,19 @@
<move file="${src}/yacy.java.orig" tofile="${src}/yacy.java" />
</target>
<target name="dist" depends="all,pdfParser,docParser"/>
<!-- target to create a zip file for the optional pdf file Parser -->
<target name="pdfParser" depends="compile" description="Creating a zip file for the pdf parser addon">
<javac srcdir="${src}/de/anomic/plasma/parser/pdf" destdir="${build}">
<classpath>
<!-- libs needed to parse pdf files -->
<pathelement location="${build}" />
<pathelement location="${libx}/PDFBox-0.7.1.jar" />
<pathelement location="${libx}/log4j-1.2.9.jar" />
</classpath>
</javac>
<zip destfile="${release}/pdfparser.zip">
<zipfileset dir="${libx}" includes="PDFBox-0.7.1.*" prefix="libx/"/>
<zipfileset dir="${libx}" includes="log4j-1.2.9.*" prefix="libx/"/>
<zipfileset dir="${build}/de/anomic/plasma/parser/pdf" prefix="classes/de/anomic/plasma/parser/pdf"/>
</zip>
<target name="parsers" depends="compile" description="Compiling and zipping all additional parsers">
<subant target="">
<property name="src" location="${src}"/>
<property name="build" location="${build}"/>
<property name="libx" location="${libx}"/>
<property name="release" location="${release}"/>
<property name="javacSource" value="${javacSource}"/>
<property name="javacTarget" value="${javacTarget}"/>
<fileset dir="${src}/" includes="de/anomic/plasma/parser/*/build.xml"/>
</subant>
</target>
<!-- target to create a zip file for the optional doc file Parser -->
<target name="docParser" depends="compile" description="Creating a zip file for the doc parser addon">
<javac srcdir="${src}/de/anomic/plasma/parser/doc" destdir="${build}">
<classpath>
<pathelement location="${build}" />
<pathelement location="${libx}/tm-extractors-0.4.jar" />
</classpath>
</javac>
<zip destfile="${release}/docparser.zip">
<zipfileset dir="${libx}" includes="tm-extractors-0.4.*" prefix="libx/"/>
<zipfileset dir="${build}/de/anomic/plasma/parser/doc" prefix="classes/de/anomic/plasma/parser/doc"/>
</zip>
</target>
<target name="dist" depends="all,parsers"/>
<target name="clean" description="make clean">
<delete>

View File

@ -63,7 +63,9 @@ The settings have not been changed.
::
<b>The submitted peer name is not well-formed. Please choose a different name.</b><br>
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters.
The settings have not been changed.#(/info)#
The settings have not been changed.
::
The new parser settings were changed successfully.#(/info)#
</p>
<p>You can now go back to the <a href="Settings_p.html">Settings</a> page if you want to make more changes.</p>

View File

@ -46,6 +46,7 @@
import java.util.*;
import java.io.*;
import de.anomic.tools.*;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.*;
import de.anomic.yacy.*;
import de.anomic.http.*;
@ -251,6 +252,19 @@ public class SettingsAck_p {
return prop;
}
if (post.containsKey("parserSettings")) {
plasmaSwitchboard sb = (plasmaSwitchboard)env;
post.remove("parserSettings");
// activate all received parsers
Enumeration mimeTypeEnum = post.keys();
sb.parser.setEnabledParserList(mimeTypeEnum);
prop.put("info", 18);
return prop;
}
// nothing made
prop.put("info", 1);//no information submitted
return prop;

View File

@ -171,6 +171,24 @@ but only if there had been changes to the seed-list.
</fieldset>
</form><br>
<p><form action="SettingsAck_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Content Parser Settings</legend>
<p>Activation/Deactivation of additional content parsers ...</p>
<p>
<table>
#{parser}#
<tr>
<td><input type="checkbox" name="#[mime]#" align="top" #(status)#::checked#(/status)#></td>
<td>#[mime]#</td>
<td>#[name]#</td>
</tr>
#{/parser}#
<tr>
<td colspan="3"><input type="submit" name="parserSettings" value="submit"></td>
</tr>
</table>
</fieldset>
</form><br>
#[footer]#
</body>

View File

@ -45,6 +45,7 @@
import java.util.*;
import de.anomic.tools.*;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.*;
import de.anomic.yacy.*;
import de.anomic.http.*;
@ -124,8 +125,37 @@ public class Settings_p {
prop.put("seedFTPPath", env.getConfig("seedFTPPath", ""));
prop.put("seedFTPAccount", env.getConfig("seedFTPAccount", ""));
prop.put("seedFTPPassword", env.getConfig("seedFTPPassword", ""));
prop.put("seedURL", env.getConfig("seedURL", ""));
prop.put("seedURL", env.getConfig("seedURL", ""));
/*
* Parser Configuration
*/
plasmaSwitchboard sb = (plasmaSwitchboard)env;
Hashtable enabledParsers = sb.parser.getEnabledParserList();
Hashtable availableParsers = sb.parser.getAvailableParserList();
// fetching a list of all available mimetypes
List availableParserKeys = Arrays.asList(availableParsers.keySet().toArray(new String[availableParsers.size()]));
// sort it
Collections.sort(availableParserKeys);
// loop through the mimeTypes and add it to the properties
int parserIdx = 0;
Iterator availableParserIter = availableParserKeys.iterator();
while (availableParserIter.hasNext()) {
String mimeType = (String) availableParserIter.next();
prop.put("parser_" + parserIdx + "_mime", mimeType);
prop.put("parser_" + parserIdx + "_name", availableParsers.get(mimeType));
prop.put("parser_" + parserIdx + "_status", enabledParsers.containsKey(mimeType) ? 1:0);
parserIdx++;
}
prop.put("parser", parserIdx);
// return rewrite properties
return prop;
}

BIN
libx/commons-logging.jar Normal file

Binary file not shown.

BIN
libx/informa-0.6.0.jar Normal file

Binary file not shown.

View File

@ -0,0 +1,24 @@
//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// Niko Schmuck
// http://sourceforge.net/projects/informa
// mailto:niko_schmuck@users.sourceforge.net
//
// This library is free software.
//
// You may redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation.
//
// Version 2.1 of the license should be included with this distribution in
// the file LICENSE. If the license is not included with this distribution,
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
// MA 02139 USA.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied waranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//

BIN
libx/jdom.jar Normal file

Binary file not shown.

View File

@ -398,7 +398,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
return bb;
}
protected static serverByteBuffer stripAll(serverByteBuffer bb) {
public static serverByteBuffer stripAll(serverByteBuffer bb) {
//return stripAllTags(s);
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}

View File

@ -0,0 +1,125 @@
//AbstractParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.parser;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.net.URL;
import de.anomic.plasma.plasmaParserDocument;
/**
 * Convenience base class for implementations of the
 * {@link de.anomic.plasma.parser.Parser} interface.
 * <p>
 * It implements the byte-array and file variants of <code>parse</code> by
 * wrapping the input into an {@link InputStream} and delegating to
 * {@link #parse(URL, String, InputStream)}, which is the only parsing method
 * a subclass has to provide.
 *
 * @author Martin Thelian
 * @version $LastChangedRevision$ / $LastChangedDate$
 */
public abstract class AbstractParser implements Parser {

    /**
     * The Constructor of this class.
     */
    public AbstractParser() {
        super();
    }

    /**
     * Parsing a document available as byte array.
     * The array is wrapped into a {@link ByteArrayInputStream} and handed to
     * {@link #parse(URL, String, InputStream)}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param source the content byte array
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     * @throws ParserException if the content could not be parsed properly
     *
     * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, byte[])
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            byte[] source) throws ParserException {
        ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source);
        return this.parse(location, mimeType, contentInputStream);
    }

    /**
     * Parsing a document stored in a {@link File}.
     * The file is opened through a {@link BufferedInputStream}, handed to
     * {@link #parse(URL, String, InputStream)} and closed again afterwards,
     * whether parsing succeeded or not.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param sourceFile the file containing the content of the document
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     * @throws ParserException if the file could not be opened or the content
     * could not be parsed properly
     *
     * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, java.io.File)
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            File sourceFile) throws ParserException {
        final BufferedInputStream contentInputStream;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
        } catch (FileNotFoundException e) {
            // Report the failure through the declared exception instead of passing
            // a null stream on to the concrete parser.
            // NOTE(review): relies on a ParserException(String) constructor - confirm.
            throw new ParserException("Unable to open the file '" + sourceFile + "' for parsing. " + e.getMessage());
        }

        try {
            return this.parse(location, mimeType, contentInputStream);
        } finally {
            // this method opened the stream, so it is responsible for releasing it
            try {
                contentInputStream.close();
            } catch (java.io.IOException e) {
                // nothing sensible can be done if closing fails; the parse result
                // (or the parse exception) must not be replaced by this error
            }
        }
    }

    /**
     * Parsing a document available as {@link InputStream}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param source the {@link InputStream} containing the document content
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     * @throws ParserException if the content could not be parsed properly
     *
     * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, java.io.InputStream)
     */
    public abstract plasmaParserDocument parse(URL location, String mimeType,
            InputStream source) throws ParserException;
}

View File

@ -5,8 +5,9 @@
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file is contributed by Martin Thelian
//last major change: 24.04.2005
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
@ -46,23 +47,67 @@ package de.anomic.plasma.parser;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.util.HashSet;
import java.util.Hashtable;
import de.anomic.plasma.plasmaParserDocument;
/**
* This interface defines a list of methods that needs to be implemented
* by each content parser class.
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public interface Parser {
/**
* Parsing a document available as byte array
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
throws ParserException;
/**
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
throws ParserException;
/**
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, InputStream source)
throws ParserException;
public HashSet getSupportedMimeTypes();
/**
* Can be used to determine the MimeType(s) that are supported by the parser
* @return a {@link Hashtable} containing a list of MimeTypes that are supported by
* the parser
*/
public Hashtable getSupportedMimeTypes();
/**
* This function should be called before reusing the parser object.
*/
public void reset();

View File

@ -0,0 +1,40 @@
<?xml version="1.0"?>
<project name="YACY - docParser" default="dist">

<!-- The text of this element is data and therefore deliberately not indented. -->
<description>
A class to parse doc documents (application/msword)
</description>

  <!--
    Naming of this add-on and of the archive it is shipped in.
    The properties src, build, libx, release, javacSource and javacTarget are
    not defined in this file; the calling build has to supply them.
  -->
  <property name="parserShortName" value="doc"/>
  <property name="parserVersion" value="0.1"/>
  <property name="parserLongName" value="${parserShortName}Parser"/>
  <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.zip"/>

  <!-- Compiles the sources of this parser package into the common build directory. -->
  <target name="compile">
    <echo message="Compiling ${parserLongName} Version ${parserVersion} ..."/>
    <javac source="${javacSource}"
           target="${javacTarget}"
           srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
           destdir="${build}">
      <classpath>
        <!-- already compiled classes the parser is built against -->
        <pathelement location="${build}"/>
        <!-- main lib needed to parse doc files -->
        <pathelement location="${libx}/tm-extractors-0.4.jar"/>
      </classpath>
    </javac>
  </target>

  <!-- Packs the needed library, the parser sources and the compiled classes into one zip file. -->
  <target name="zip" depends="compile">
    <echo message="Compressing ${parserLongName} Version ${parserVersion} ..."/>
    <zip destfile="${parserArchive}">
      <zipfileset prefix="libx/"
                  dir="${libx}"
                  includes="tm-extractors-0.4.*"/>
      <zipfileset prefix="source/de/anomic/plasma/parser/${parserShortName}"
                  dir="${src}/de/anomic/plasma/parser/${parserShortName}"/>
      <zipfileset prefix="classes/de/anomic/plasma/parser/${parserShortName}"
                  dir="${build}/de/anomic/plasma/parser/${parserShortName}"/>
    </zip>
  </target>

  <!-- Default target: build the classes and the archive. -->
  <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>

</project>

View File

@ -43,53 +43,33 @@
package de.anomic.plasma.parser.doc;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import org.textmining.text.extraction.WordExtractor;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
public class docParser implements Parser {
public class docParser
extends AbstractParser
implements Parser {
/**
* a list of mime types that are supported by this parser class
*/
public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] {
new String("application/msword")
}));
* @see #getSupportedMimeTypes()
*/
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
static { SUPPORTED_MIME_TYPES.put("application/msword","doc"); }
public docParser() {
super();
}
public plasmaParserDocument parse(URL location, String mimeType,
byte[] source) throws ParserException {
ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source);
return this.parse(location,mimeType,contentInputStream);
}
public plasmaParserDocument parse(URL location, String mimeType,
File sourceFile) throws ParserException {
BufferedInputStream contentInputStream = null;
try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
return this.parse(location, mimeType, contentInputStream);
}
public plasmaParserDocument parse(URL location, String mimeType,
InputStream source) throws ParserException {
@ -117,21 +97,12 @@ public class docParser implements Parser {
}
}
public HashSet getSupportedMimeTypes() {
public java.util.Hashtable getSupportedMimeTypes() {
return docParser.SUPPORTED_MIME_TYPES;
}
public void reset() {
// TODO Auto-generated method stub
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
// Nothing todo here at the moment
}
}

View File

@ -0,0 +1,44 @@
<?xml version="1.0"?>
<project name="YACY - pdfParser" default="dist">

<!-- The text of this element is data and therefore deliberately not indented. -->
<description>
A class to parse pdf documents (application/pdf)
</description>

  <!--
    Naming of this add-on and of the archive it is shipped in.
    The properties src, build, libx, release, javacSource and javacTarget are
    not defined in this file; the calling build has to supply them.
  -->
  <property name="parserShortName" value="pdf"/>
  <property name="parserVersion" value="0.1"/>
  <property name="parserLongName" value="${parserShortName}Parser"/>
  <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.zip"/>

  <!-- Compiles the sources of this parser package into the common build directory. -->
  <target name="compile">
    <echo message="Compiling ${parserLongName} Version ${parserVersion} ..."/>
    <javac source="${javacSource}"
           target="${javacTarget}"
           srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
           destdir="${build}">
      <classpath>
        <!-- already compiled classes the parser is built against -->
        <pathelement location="${build}"/>
        <!-- main lib needed to parse pdf files -->
        <pathelement location="${libx}/PDFBox-0.7.1.jar"/>
        <!-- libs needed by the main lib -->
        <pathelement location="${libx}/log4j-1.2.9.jar"/>
      </classpath>
    </javac>
  </target>

  <!-- Packs the needed libraries, the parser sources and the compiled classes into one zip file. -->
  <target name="zip" depends="compile">
    <echo message="Compressing ${parserLongName} Version ${parserVersion} ..."/>
    <zip destfile="${parserArchive}">
      <zipfileset prefix="libx/"
                  dir="${libx}"
                  includes="PDFBox-0.7.1.*"/>
      <zipfileset prefix="libx/"
                  dir="${libx}"
                  includes="log4j-1.2.9.*"/>
      <zipfileset prefix="source/de/anomic/plasma/parser/${parserShortName}"
                  dir="${src}/de/anomic/plasma/parser/${parserShortName}"/>
      <zipfileset prefix="classes/de/anomic/plasma/parser/${parserShortName}"
                  dir="${build}/de/anomic/plasma/parser/${parserShortName}"/>
    </zip>
  </target>

  <!-- Default target: build the classes and the archive. -->
  <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>

</project>

View File

@ -54,6 +54,7 @@ import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser;
@ -62,42 +63,36 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
public class pdfParser implements Parser
public class pdfParser extends AbstractParser implements Parser
{
/**
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] {
new String("application/pdf")
}));
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
static { SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); }
/**
* a list of file extensions that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final HashSet SUPPORTED_FILE_EXT = new HashSet(Arrays.asList(new String[] {
new String("pdf")
}));
public pdfParser() {
super();
}
public HashSet getSupportedMimeTypes() {
public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
BufferedInputStream contentInputStream = null;
try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
return this.parse(location, mimeType, contentInputStream);
}
public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {
ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source);
return this.parse(location,mimeType,contentInputStream);
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
try {
@ -155,8 +150,12 @@ public class pdfParser implements Parser
}
public void reset() {
// TODO Auto-generated method stub
// Nothing todo here at the moment
}
public HashSet getSupportedFileExtensions() {
return SUPPORTED_FILE_EXT;
}
}

View File

@ -0,0 +1,47 @@
<?xml version="1.0"?>
<project name="YACY - rssParser" default="dist">

<!-- The text of this element is data and therefore deliberately not indented.
     The listed mime types mirror the ones registered by the rssParser class. -->
<description>
A class to parse rss/atom feeds
(text/rss, application/rss+xml, application/rdf+xml, application/atom+xml)
</description>

  <!--
    Naming of this add-on and of the archive it is shipped in.
    The properties src, build, libx, release, javacSource and javacTarget are
    not defined in this file; the calling build has to supply them.
  -->
  <property name="parserShortName" value="rss"/>
  <property name="parserVersion" value="0.1"/>
  <property name="parserLongName" value="${parserShortName}Parser"/>
  <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.zip"/>

  <!-- Compiles the sources of this parser package into the common build directory. -->
  <target name="compile">
    <echo message="Compiling ${parserLongName} Version ${parserVersion} ..."/>
    <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
           destdir="${build}"
           source="${javacSource}"
           target="${javacTarget}">
      <classpath>
        <!-- already compiled classes the parser is built against -->
        <pathelement location="${build}"/>
        <!-- main lib needed to parse rss/atom feed files -->
        <pathelement location="${libx}/informa-0.6.0.jar"/>
        <!-- libs needed by the main lib -->
        <pathelement location="${libx}/commons-logging.jar"/>
        <pathelement location="${libx}/jdom.jar"/>
      </classpath>
    </javac>
  </target>

  <!-- Packs the needed libraries, the parser sources and the compiled classes into one zip file. -->
  <target name="zip" depends="compile">
    <echo message="Compressing ${parserLongName} Version ${parserVersion} ..."/>
    <zip destfile="${parserArchive}">
      <zipfileset dir="${libx}" includes="informa-0.6.0.*" prefix="libx/"/>
      <zipfileset dir="${libx}" includes="commons-logging.jar" prefix="libx/"/>
      <zipfileset dir="${libx}" includes="jdom.jar" prefix="libx/"/>
      <zipfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
                  prefix="source/de/anomic/plasma/parser/${parserShortName}"/>
      <zipfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
                  prefix="classes/de/anomic/plasma/parser/${parserShortName}"/>
    </zip>
  </target>

  <!-- Default target: build the classes and the archive. -->
  <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>

</project>

View File

@ -0,0 +1,180 @@
package de.anomic.plasma.parser.rss;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverFileUtils;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ImageIF;
import de.nava.informa.impl.basic.ChannelBuilder;
import de.nava.informa.impl.basic.Item;
import de.nava.informa.parsers.FeedParser;
public class rssParser extends AbstractParser implements Parser {
/**
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
static {
SUPPORTED_MIME_TYPES.put("text/rss","xml,rss,rdf");
SUPPORTED_MIME_TYPES.put("application/rdf+xml","xml,rss,rdf");
SUPPORTED_MIME_TYPES.put("application/rss+xml","xml,rss,rdf");
SUPPORTED_MIME_TYPES.put("application/atom+xml","xml,atom");
}
/**
* a list of file extensions that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final HashSet SUPPORTED_FILE_EXT = new HashSet(Arrays.asList(new String[] {
new String("xml"),
new String("rss"),
new String("rdf"),
new String("atom")
}));
public rssParser() {
super();
}
public plasmaParserDocument parse(URL location, String mimeType,
InputStream source) throws ParserException {
try {
LinkedList feedSections = new LinkedList();
HashMap anchors = new HashMap();
HashMap images = new HashMap();
serverByteBuffer text = new serverByteBuffer();
// creating a channel-builder
ChannelBuilder builder = new ChannelBuilder();
// parsing the rss/atom feed
ChannelIF channel = FeedParser.parse(builder, source);
// getting the rss feed title and description
String feedTitle = channel.getTitle();
// getting the feed description
String feedDescription = channel.getDescription();
// getting the channel site url
URL channelSiteURL = channel.getSite();
ImageIF channelImage = channel.getImage();
if (channelImage != null) {
images.put(channelImage.getLocation().toString(),channelImage.getTitle());
}
// loop through the feed items
Collection feedItemCollection = channel.getItems();
if (!feedItemCollection.isEmpty()) {
Iterator feedItemIterator = feedItemCollection.iterator();
while (feedItemIterator.hasNext()) {
Item item = (Item)feedItemIterator.next();
String itemTitle = item.getTitle();
URL itemURL = item.getLink();
String itemDescr = item.getDescription();
feedSections.add(itemTitle);
anchors.put(itemURL.toString(),itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(new serverByteBuffer(htmlFilterAbstractScraper.stripAll(new serverByteBuffer(itemDescr.getBytes()))).trim()).append((byte) ' ');
String itemContent = item.getElementValue("content");
if ((itemContent != null) && (itemContent.length() > 0)) {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os);
String itemHeadline = scraper.getHeadline();
if ((itemHeadline != null) && (itemHeadline.length() > 0)) {
feedSections.add(itemHeadline);
}
Map itemLinks = scraper.getAnchors();
if ((itemLinks != null) && (itemLinks.size() > 0)) {
anchors.putAll(itemLinks);
}
Map itemImages = scraper.getImages();
if ((itemImages != null) && (itemImages.size() > 0)) {
images.putAll(itemImages);
}
byte[] extractedText = scraper.getText();
if ((extractedText != null) && (extractedText.length > 0)) {
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
text.append(scraper.getText());
}
}
}
}
/* (URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images)
*/
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
null,
null,
feedTitle,
(String[]) feedSections.toArray(new String[feedSections.size()]),
feedDescription,
text.getBytes(),
anchors,
images);
return theDoc;
} catch (Exception e) {
}
return null;
}
/**
 * Returns the table of mime types this parser declares as supported.
 *
 * @return the SUPPORTED_MIME_TYPES table itself; it is returned directly,
 *         without copying, so callers receive the same instance each time
 */
public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Resets this parser. Currently a no-op: the method body is empty.
 */
public void reset() {
// intentionally empty for now
// NOTE(review): confirm this parser keeps no per-document state that would need clearing here
}
/**
 * Returns the set of file extensions this parser declares as supported.
 *
 * @return the SUPPORTED_FILE_EXT set itself; it is returned directly,
 *         without copying, so callers receive the same instance each time
 */
public HashSet getSupportedFileExtensions() {
return SUPPORTED_FILE_EXT;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -148,7 +148,7 @@ public class plasmaParserDocument {
Iterator i;
String url;
int extpos;
String ext;
String ext = null;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
@ -163,10 +163,14 @@ public class plasmaParserDocument {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
if (url.indexOf("?") != -1) {
ext = url.substring(extpos,url.indexOf("?")).toLowerCase();
} else {
ext = url.substring(extpos).toLowerCase();
}
normal = plasmaParser.urlNormalform(url);
if (normal != null) {
if (plasmaParser.mediaExtSet.contains(ext.substring(1))) {
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not a normal anchor, it's a media link
medialinks.put(normal, entry.getValue());
} else {

View File

@ -76,7 +76,7 @@ public final class serverLog {
// statics
private static serverLog genericLog = new serverLog("GENERIC", LOGLEVEL_DEBUG); // generic log
private static LinkedList lastLog = new LinkedList(); // for Web-Interface
private static LinkedList lastLog = new LinkedList(); // for Web-Interface
private static int lastlogMaxSize = 400; // for Web-Interface
// class variables

View File

@ -1,2 +1,8 @@
cd `dirname $0`
java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar -server yacy
# generating the proper classpath
CLASSPATH=""
for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
java -classpath classes:$CLASSPATH -server yacy

View File

@ -6,13 +6,19 @@ then
echo
else
cd `dirname $0`
# generating the proper classpath
CLASSPATH=""
for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
if [ x$1 != x-d ]
then
nohup java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy >> yacy.log &
nohup java -classpath classes:$CLASSPATH yacy >> yacy.log &
echo "YaCy started as daemon process. View it's activity in yacy.log"
echo "To stop YaCy, please execute stopYACY.sh and wait some seconds"
echo "To administrate YaCy, start your web browser and open http://localhost:8080"
else
java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar yacy
java -classpath classes:$CLASSPATH yacy
fi
fi

View File

@ -1,2 +1,8 @@
cd `dirname $0`
java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy -shutdown
# generating the proper classpath
CLASSPATH=""
for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
java -classpath classes:$CLASSPATH yacy -shutdown

View File

@ -1,5 +1,11 @@
#!/bin/sh
cd `dirname $0`
java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy -shutdown
# generating the proper classpath
CLASSPATH=""
for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done
java -classpath classes:$CLASSPATH yacy -shutdown
echo "please wait until the YaCy daemon process terminates"
echo "you can monitor this with 'tail -f yacy.log' and 'fuser yacy.log'"

View File

@ -10,7 +10,7 @@
# the http service configurations
# port number of server
port = 8080
port = 8090
# time-out of client control socket in milliseconds
# since this applies only to the client-proxy connection,
@ -79,13 +79,13 @@ proxyCache = DATA/HTCACHE
proxyCacheSize = 200
# the following mime-types are the whitelist for indexing
parseableMime=application/xhtml+xml,text/html,text/plain
parseableMime=application/xhtml+xml,text/html,text/plain,application/pdf,application/msword
# media extension string
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> tags that reference non-html (media) files
# These files will be excluded from indexing
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css
mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

View File

@ -1,2 +1,8 @@
#plasmaParser configuration file
#Mon May 02 10:12:02 CEST 2005
application/atom+xml=de.anomic.plasma.parser.rss.rssParser
text/rss=de.anomic.plasma.parser.rss.rssParser
application/rss+xml=de.anomic.plasma.parser.rss.rssParser
application/rdf+xml=de.anomic.plasma.parser.rss.rssParser
application/msword=de.anomic.plasma.parser.doc.docParser
application/pdf=de.anomic.plasma.parser.pdf.pdfParser
application/msword=de.anomic.plasma.parser.doc.docParser