yacy_search_server/source/net/yacy/document/parser/mmParser.java
reger 06d0e2aeb9 result heuristic (also used in greedy learning mode) to use outbound links if result is full index doc. Otherwise use default loader method.
- Above brought up that parser start url parameter, declared as AnchorURL, uses only methods of parent object DigestURL (changed parameter declaration accordingly).
2016-02-16 02:05:58 +01:00

152 lines
4.7 KiB
Java

/**
* mmParser
* Copyright 2010 by Marc Nause, marc.nause@gmx.de, Braunschweig, Germany
* First released 27.12.2010 at http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
// this is a new implementation of this parser idiom using multiple documents as result set
/**
 * Parser for FreeMind mind map files ({@code .mm}).
 *
 * The XML input is read with a SAX parser; the {@code TEXT} attribute of every
 * {@code node} element is collected. The first collected text is handed to the
 * resulting document via {@code singleList(...)}, and all collected texts,
 * each followed by {@code ". "}, form the document content.
 */
public class mmParser extends AbstractParser implements Parser {

    /** One SAX parser per thread, created lazily by {@link #getParser()}. */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<>();

    /**
     * Creates the parser and registers the file extension and MIME types it supports.
     */
    public mmParser() {
        super("FreeMind Parser");
        this.SUPPORTED_EXTENSIONS.add("mm");
        this.SUPPORTED_MIME_TYPES.add("application/freemind");
        this.SUPPORTED_MIME_TYPES.add("application/x-freemind");
    }

    /**
     * Returns the SAX parser cached for the current thread, creating it on first use.
     *
     * The factory is hardened against XML external entity (XXE) attacks because the
     * parsed input comes from arbitrary remote sources: secure processing is enabled
     * and external general/parameter entities as well as external DTD loading are
     * switched off.
     *
     * @return the SAX parser of the calling thread
     * @throws SAXException if the parser cannot be configured or created
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                final SAXParserFactory factory = SAXParserFactory.newInstance();
                factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
                factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
                factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
                try {
                    // Xerces-specific feature; other implementations may not know it,
                    // so failing to set it must not prevent parsing.
                    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
                } catch (final ParserConfigurationException | SAXException ignored) {
                    // best effort only, the standard features above are already set
                }
                parser = factory.newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Parses a FreeMind mind map into a single document.
     *
     * Parsing is best effort: if the input cannot be read or is not well-formed XML,
     * a warning is logged and a document with an empty title text and empty content
     * is returned.
     *
     * @param location the URL of the parsed resource, passed on to the document
     * @param mimeType the MIME type of the resource, passed on to the document
     * @param charset not used here; the resulting document is always declared as UTF-8
     * @param scraper not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source the stream providing the FreeMind XML
     * @return an array containing exactly one document
     * @throws Parser.Failure declared by the signature; not thrown explicitly here
     * @throws InterruptedException declared by the signature; not thrown explicitly here
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)
            throws Parser.Failure, InterruptedException
    {
        final StringBuilder sb = new StringBuilder();
        String rootElementText = "";
        byte[] content = new byte[0];

        try {
            final SAXParser saxParser = getParser();
            final FreeMindHandler freeMindHandler = new FreeMindHandler();
            saxParser.parse(source, freeMindHandler);

            final List<String> nodeTextList = freeMindHandler.getNodeText();
            // the first node in document order is the root of the mind map
            rootElementText = nodeTextList.isEmpty() ? "" : nodeTextList.get(0);

            for (final String nodeText : nodeTextList) {
                sb.append(nodeText);
                sb.append(". ");
            }
            content = UTF8.getBytes(sb.toString());
        } catch (final SAXException | IOException ex) {
            // do not reuse a parser whose last run was aborted; the next call
            // to getParser() builds a fresh instance for this thread
            tlSax.remove();
            AbstractParser.log.warn(ex.getMessage());
        }

        return new Document[]{new Document(
                location,
                mimeType,
                StandardCharsets.UTF_8.name(),
                this,
                null,
                null,
                singleList(rootElementText),
                null,
                null,
                null,
                null,
                0.0d, 0.0d,
                content,
                null,
                null,
                null,
                false,
                new Date())};
    }

    /**
     * SAX handler collecting the {@code TEXT} attribute values of all
     * {@code node} elements in document order.
     *
     * Declared static because it does not access any state of the enclosing parser.
     */
    private static final class FreeMindHandler extends DefaultHandler {

        private final List<String> nodeText = new ArrayList<>();

        @Override
        public void startElement(final String uri, final String localName,
                final String qName, final Attributes attributes) {
            if ("node".equals(qName)) {
                final String textValue = attributes.getValue("TEXT");
                // nodes without a TEXT attribute are skipped
                if (textValue != null) {
                    this.nodeText.add(textValue);
                }
            }
        }

        /**
         * @return the collected node texts in the order the nodes were encountered
         */
        protected List<String> getNodeText() {
            return this.nodeText;
        }
    }
}