yacy_search_server/source/de/anomic/plasma/plasmaParserDocument.java

//plasmaParserDocument.java 
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//last major change: 24.04.2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.

package de.anomic.plasma;

import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Container for the result of parsing one document: its source URL, mime type,
 * titles, section headlines, abstract, visible text and the anchor/image maps.
 *
 * <p>On top of the raw anchor and image maps the class offers three derived
 * views (hyperlinks, media links, email links) which are computed lazily on
 * the first call of one of the corresponding getters.
 */
public class plasmaParserDocument {
    
    URL location;       // the source url
    String mimeType;    // mimeType as taken from http header
    String keywords;    // most resources provide a keyword field
    String shortTitle;  // a shortTitle mostly appears in the window header (border)
    String longTitle;   // the real title of the document, commonly h1-tags
    String[] sections;  // if present: more titles/headlines appearing in the document
    String abstrct;     // an abstract, if present: short content description
    byte[] text;        // the clear text, all that is visible
    Map anchors;        // all links embedded as clickable entities (anchor tags)
    Map images;         // all visible pictures in document
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
    Map hyperlinks;     // derived view, null until resortLinks() has run
    Map medialinks;     // derived view, null until resortLinks() has run
    Map emaillinks;     // derived view, null until resortLinks() has run
                    
    /**
     * Creates a document description.
     *
     * <p>Null values are replaced by defaults: the mime type becomes
     * "application/octet-stream", keywords/titles/abstract become the empty
     * string, and the anchor and image maps become empty maps.
     * <code>location</code>, <code>sections</code> and <code>text</code> are
     * stored as given (and may therefore be null).
     *
     * @param location   the source url
     * @param mimeType   mime type as taken from the http header
     * @param keywords   content of the keyword field
     * @param shortTitle the short title
     * @param longTitle  the long title
     * @param sections   further headlines of the document
     * @param abstrct    short content description
     * @param text       the visible text
     * @param anchors    map from link url (String) to its description
     * @param images     map from image url (String) to its description
     */
    public plasmaParserDocument(URL location, String mimeType,
                    String keywords, String shortTitle, String longTitle,
                    String[] sections, String abstrct,
                    byte[] text, Map anchors, Map images) {
        this.location = location;
        this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
        this.keywords = (keywords==null)?"":keywords;
        this.shortTitle = (shortTitle==null)?"":shortTitle;
        this.longTitle = (longTitle==null)?"":longTitle;
        this.sections = sections;
        this.abstrct = (abstrct==null)?"":abstrct;
        this.text = text;
        this.anchors = (anchors==null)?new HashMap():anchors;
        this.images = (images==null)?new HashMap():images;
        // the derived link views are computed on demand, see resortLinks()
        this.hyperlinks = null;
        this.medialinks = null;
        this.emaillinks = null;
    }
    
    /**
     * Resolves a path relative to the document location and returns its
     * normal form as computed by plasmaParser.urlNormalform.
     *
     * @param relativePath the path to resolve against <code>location</code>
     * @return the normalized url, or the empty string if any exception occurs
     */
    private String absolutePath(String relativePath) {
        try {
            return plasmaParser.urlNormalform(new URL(location, relativePath));
        } catch (Exception e) {
            // best effort: an unresolvable path is reported as an empty string
            return "";
        }
    }
    
    /**
     * @return the short title; the long title is used only if the short title
     *         field is null (the constructor never leaves it null)
     */
    public String getMainShortTitle() {
        if (shortTitle != null) return shortTitle; else return longTitle;
    }
    
    /**
     * @return the long title; the short title is used only if the long title
     *         field is null (the constructor never leaves it null)
     */
    public String getMainLongTitle() {
        if (longTitle != null) return longTitle; else return shortTitle;
    }
    
    /**
     * @return the section headlines, or a one-element array holding
     *         {@link #getMainLongTitle()} if no sections were given
     */
    public String[] getSectionTitles() {
        if (sections != null) return sections; else return new String[]{getMainLongTitle()};
    }

    /**
     * @return the abstract; {@link #getMainLongTitle()} is used only if the
     *         abstract field is null (the constructor never leaves it null)
     */
    public String getAbstract() {
        if (abstrct != null) return abstrct; else return getMainLongTitle();
    }
    
    /**
     * @return only the clear (visible) text, not the source data
     */
    public byte[] getText() {
        return text;
    }
    
    /**
     * @return all links embedded as anchors (clickable entities); this is the
     *         internal map, not a copy
     */
    public Map getAnchors() {
        return anchors;
    }
    
    /**
     * @return all links embedded as pictures (visible in the document); this
     *         is the internal map, not a copy
     */
    public Map getImages() {
        return images;
    }
    
    // the next three methods provide a calculated view on the getAnchors/getImages:
    
    /**
     * @return the anchors that are neither email nor media links, plus the
     *         artificial links added by {@link #expandHyperlinks()}
     */
    public Map getHyperlinks() {
        if (hyperlinks == null) resortLinks();
        return hyperlinks;
    }
    
    /**
     * @return the anchors whose file extension is accepted by
     *         plasmaParser.mediaExtContains, plus all images
     */
    public Map getMedialinks() {
        if (medialinks == null) resortLinks();
        return medialinks;
    }
    
    /**
     * @return the "mailto:" anchors, keyed by the address without the
     *         "mailto:" prefix
     */
    public Map getEmaillinks() {
        if (emaillinks == null) resortLinks();
        return emaillinks;
    }
    
    /**
     * Returns the lower-cased file extension of a url, without the leading dot.
     * A query string (everything from the first '?') is ignored, so a '.'
     * inside the query cannot be mistaken for the extension separator.
     *
     * @param url the url to inspect, must not be null
     * @return the text after the last '.' of the part before '?', or the
     *         empty string if that part contains no '.'
     */
    private static String fileExtension(String url) {
        int queryStart = url.indexOf("?");
        String path = (queryStart < 0) ? url : url.substring(0, queryStart);
        int dot = path.lastIndexOf(".");
        if (dot < 0) return "";
        return path.substring(dot + 1).toLowerCase();
    }
    
    /**
     * (Re)builds the hyperlink, media link and email link maps from the
     * anchor and image maps and finally calls {@link #expandHyperlinks()}.
     *
     * <p>Anchors with a null key are skipped. Anchors without any '.' at an
     * index greater than zero, and urls for which plasmaParser.urlNormalform
     * returns null, are not put into any of the maps.
     */
    private synchronized void resortLinks() {
        hyperlinks = new HashMap();
        medialinks = new HashMap();
        emaillinks = new HashMap();
        Iterator i = anchors.entrySet().iterator();
        Map.Entry entry;
        String url;
        String normal;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            // a null key cannot be classified; skip it instead of failing
            if (url == null) continue;
            if (url.startsWith("mailto:")) {
                // store the address without the 7 characters of "mailto:"
                emaillinks.put(url.substring(7), entry.getValue());
                continue;
            }
            // only urls containing a '.' (not at index 0) are considered
            if (url.lastIndexOf(".") <= 0) continue;
            normal = plasmaParser.urlNormalform(url);
            if (normal == null) continue;
            if (plasmaParser.mediaExtContains(fileExtension(url))) {
                // this is not a normal anchor, it is a media link
                medialinks.put(normal, entry.getValue());
            } else {
                hyperlinks.put(normal, entry.getValue());
            }
        }
        // finally add the images to the medialinks
        i = images.entrySet().iterator();
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            normal = plasmaParser.urlNormalform(url);
            if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
        }
        expandHyperlinks();
    }
    
    
    /**
     * Adds artificial hyperlinks to the hyperlink map: the results of
     * plasmaParser.allReflinks and plasmaParser.allSubpaths, each applied to
     * the current hyperlink map and to the media link map.
     *
     * <p>The link maps must already exist (they are created by the first call
     * of one of the link getters); calling this method before that fails with
     * a NullPointerException because the maps are still null.
     */
    public synchronized void expandHyperlinks() {
        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
        hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
    }
    
}
*) Eclipse has accidentally copied in the wrong file header into the new files (because these headers were accidentally set as default for the whole workspace instead of the project) Fixed. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@48 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:47:34 +02:00			`//plasmaParserDocument.java`
			`//------------------------`
			`//part of YaCy`
			`//(C) by Michael Peter Christen; mc@anomic.de`
			`//first published on http://www.anomic.de`
			`//Frankfurt, Germany, 2005`
			`//`
			`//last major change: 24.04.2005`
			`//`
			`//This program is free software; you can redistribute it and/or modify`
			`//it under the terms of the GNU General Public License as published by`
			`//the Free Software Foundation; either version 2 of the License, or`
			`//(at your option) any later version.`
			`//`
			`//This program is distributed in the hope that it will be useful,`
			`//but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`//GNU General Public License for more details.`
			`//`
			`//You should have received a copy of the GNU General Public License`
			`//along with this program; if not, write to the Free Software`
			`//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`
			`//`
			`//Using this software in any meaning (reading, learning, copying, compiling,`
			`//running) means that you agree that the Author(s) is (are) not responsible`
			`//for cost, loss of data or any harm that may be caused directly or indirectly`
			`//by usage of this softare or this documentation. The usage of this software`
			`//is on your own risk. The installation and usage (starting/running) of this`
			`//software may allow other people or application to access your computer and`
			`//any attached devices and is highly dependent on the configuration of the`
			`//software which must be done by the user of the software; the author(s) is`
			`//(are) also not responsible for proper configuration and usage of the`
			`//software, even if provoked by documentation provided together with`
			`//the software.`
			`//`
			`//Any changes to this file according to the GPL as documented in the file`
			`//gpl.txt aside this file in the shipment you received can be done to the`
			`//lines that follows this copyright notice here, but changes must not be`
			`//done inside the copyright notive above. A re-distribution must contain`
			`//the intact and unchanged copyright notice.`
			`//Contributions and changes to the program code must be marked as such.`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00
			`package de.anomic.plasma;`

			`import java.net.URL;`
			`import java.util.HashMap;`
			`import java.util.Iterator;`
			`import java.util.Map;`

			`public class plasmaParserDocument {`

			`URL location; // the source url`
			`String mimeType; // mimeType as taken from http header`
			`String keywords; // most resources provide a keyword field`
			`String shortTitle; // a shortTitle mostly appears in the window header (border)`
			`String longTitle; // the real title of the document, commonly h1-tags`
			`String[] sections; // if present: more titles/headlines appearing in the document`
			`String abstrct; // an abstract, if present: short content description`
			`byte[] text; // the clear text, all that is visible`
			`Map anchors; // all links embedded as clickeable entities (anchor tags)`
			`Map images; // all visible pictures in document`
			`// the anchors and images - Maps are URL-to-EntityDescription mappings.`
			`// The EntityDescription appear either as visible text in anchors or as alternative`
			`// text in image tags.`
			`Map hyperlinks;`
			`Map medialinks;`
			`Map emaillinks;`

			`public plasmaParserDocument(URL location, String mimeType,`
			`String keywords, String shortTitle, String longTitle,`
			`String[] sections, String abstrct,`
			`byte[] text, Map anchors, Map images) {`
			`this.location = location;`
*) some minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@49 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:52:11 +02:00			`this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;`
			`this.keywords = (keywords==null)?"":keywords;`
			`this.shortTitle = (shortTitle==null)?"":shortTitle;`
			`this.longTitle = (longTitle==null)?"":longTitle;`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`this.sections = sections;`
*) some minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@49 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:52:11 +02:00			`this.abstrct = (abstrct==null)?"":abstrct;`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`this.text = text;`
			`this.anchors = (anchors==null)?new HashMap():anchors;`
			`this.images = (images==null)?new HashMap():images;`
			`this.hyperlinks = null;`
			`this.medialinks = null;`
			`this.emaillinks = null;`
			`}`

			`private String absolutePath(String relativePath) {`
			`try {`
			`return plasmaParser.urlNormalform(new URL(location, relativePath));`
			`} catch (Exception e) {`
			`return "";`
			`}`
			`}`

			`public String getMainShortTitle() {`
			`if (shortTitle != null) return shortTitle; else return longTitle;`
			`}`

			`public String getMainLongTitle() {`
			`if (longTitle != null) return longTitle; else return shortTitle;`
			`}`

			`public String[] getSectionTitles() {`
			`if (sections != null) return sections; else return new String[]{getMainLongTitle()};`
			`}`

			`public String getAbstract() {`
			`if (abstrct != null) return abstrct; else return getMainLongTitle();`
			`}`

			`public byte[] getText() {`
			`// returns only the clear (visible) text (not the source data)`
			`return text;`
			`}`

			`public Map getAnchors() {`
			`// returns all links embedded as anchors (clickeable entities)`
			`return anchors;`
			`}`

			`public Map getImages() {`
			`// returns all links enbedded as pictures (visible iin document)`
			`return images;`
			`}`

			`// the next three methods provide a calculated view on the getAnchors/getImages:`

			`public Map getHyperlinks() {`
			`// this is a subset of the getAnchor-set: only links to other hyperrefs`
			`if (hyperlinks == null) resortLinks();`
			`return hyperlinks;`
			`}`

			`public Map getMedialinks() {`
			`// this is partly subset of getAnchor and getImage: all non-hyperrefs`
			`if (medialinks == null) resortLinks();`
			`return medialinks;`
			`}`

			`public Map getEmaillinks() {`
			`// this is part of the getAnchor-set: only links to email addresses`
			`if (emaillinks == null) resortLinks();`
			`return emaillinks;`
			`}`

			`private synchronized void resortLinks() {`
			`Iterator i;`
			`String url;`
			`int extpos;`
) Migration of optional Content Parser integration - each additional parser must be in a subpackage of plasma.parser - each parser must have its own ant build file (which will be called automatically from the main build file) - Calling the main build file results in building a separate zip file for each optional parser. This zip file includes: + sources of the Parser.java + compiled classes of the Parser.java + needed additional libs (libx) - To install an additional parser the user simply needs to extract the zip file listed above into his/her yacy directory. - The configuration (enabling/disabling) of a parser can be done via the webinterface (currently the settings dialoge) and is done "on-the-fly". The installation can not be done "on-the-fly" at the moment because of classpath issues. - The classpath of the linux startup/stop scripts is generated automatically now (including all libraries from lib and libx). ) Bugfix: File Extension was not calculated correctly by the crawler e.g.: file extension was accidentally: .php?param=value Corrected. *) Adding additional parser for parsing of rss/atom feeds - added needed libs to do this. TODO: - automatic building classpath for windows startup scripts git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@78 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-05-03 11:47:56 +02:00			`String ext = null;`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`i = anchors.entrySet().iterator();`
			`hyperlinks = new HashMap();`
			`medialinks = new HashMap();`
			`emaillinks = new HashMap();`
			`Map.Entry entry;`
			`while (i.hasNext()) {`
			`entry = (Map.Entry) i.next();`
			`url = (String) entry.getKey();`
			`if ((url != null) && (url.startsWith("mailto:"))) {`
			`emaillinks.put(url.substring(7), entry.getValue());`
			`} else {`
			`extpos = url.lastIndexOf(".");`
			`String normal;`
			`if (extpos > 0) {`
) Migration of optional Content Parser integration - each additional parser must be in a subpackage of plasma.parser - each parser must have its own ant build file (which will be called automatically from the main build file) - Calling the main build file results in building a separate zip file for each optional parser. This zip file includes: + sources of the Parser.java + compiled classes of the Parser.java + needed additional libs (libx) - To install an additional parser the user simply needs to extract the zip file listed above into his/her yacy directory. - The configuration (enabling/disabling) of a parser can be done via the webinterface (currently the settings dialoge) and is done "on-the-fly". The installation can not be done "on-the-fly" at the moment because of classpath issues. - The classpath of the linux startup/stop scripts is generated automatically now (including all libraries from lib and libx). ) Bugfix: File Extension was not calculated correctly by the crawler e.g.: file extension was accidentally: .php?param=value Corrected. *) Adding additional parser for parsing of rss/atom feeds - added needed libs to do this. TODO: - automatic building classpath for windows startup scripts git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@78 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-05-03 11:47:56 +02:00			`if (url.indexOf("?") != -1) {`
			`ext = url.substring(extpos,url.indexOf("?")).toLowerCase();`
			`} else {`
			`ext = url.substring(extpos).toLowerCase();`
			`}`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`normal = plasmaParser.urlNormalform(url);`
			`if (normal != null) {`
) Migration of optional Content Parser integration - each additional parser must be in a subpackage of plasma.parser - each parser must have its own ant build file (which will be called automatically from the main build file) - Calling the main build file results in building a separate zip file for each optional parser. This zip file includes: + sources of the Parser.java + compiled classes of the Parser.java + needed additional libs (libx) - To install an additional parser the user simply needs to extract the zip file listed above into his/her yacy directory. - The configuration (enabling/disabling) of a parser can be done via the webinterface (currently the settings dialoge) and is done "on-the-fly". The installation can not be done "on-the-fly" at the moment because of classpath issues. - The classpath of the linux startup/stop scripts is generated automatically now (including all libraries from lib and libx). ) Bugfix: File Extension was not calculated correctly by the crawler e.g.: file extension was accidentally: .php?param=value Corrected. *) Adding additional parser for parsing of rss/atom feeds - added needed libs to do this. TODO: - automatic building classpath for windows startup scripts git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@78 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-05-03 11:47:56 +02:00			`if (plasmaParser.mediaExtContains(ext.substring(1))) {`
) adding an new package for extra content parsers ) adding content parser for - pdf (using the pdf-box library) - doc (using the textmining.org library) ) adding a Interface for content parsers ) adding a configuration file which can be used to configure which parser is used for which mimeType ) Sempahore class was moved and renamed to serverSemaphore ) Changing yacy shutdown behaviour Buzy waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-24 23:24:53 +02:00			`// this is not an normal anchor, its a media link`
			`medialinks.put(normal, entry.getValue());`
			`} else {`
			`hyperlinks.put(normal, entry.getValue());`
			`}`
			`}`
			`}`
			`}`
			`}`
			`// finally add the images to the medialinks`
			`i = images.entrySet().iterator();`
			`String normal;`
			`while (i.hasNext()) {`
			`entry = (Map.Entry) i.next();`
			`url = (String) entry.getKey();`
			`normal = plasmaParser.urlNormalform(url);`
			`if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException`
			`}`
			`expandHyperlinks();`
			`}`


			`public synchronized void expandHyperlinks() {`
			`// we add artificial hyperlinks to the hyperlink set that can be calculated from`
			`// given hyperlinks and imagelinks`
			`hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));`
			`hyperlinks.putAll(plasmaParser.allReflinks(medialinks));`
			`hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));`
			`hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));`
			`}`

several bugfixes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@71 6c8d7289-2bf4-0310-a012-ef5d649a1542 2005-04-29 00:04:57 +02:00			`}`