/** * Vocabulary_p * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 07.05.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . */ import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.StreamResponse; import net.yacy.data.TransactionManager; import net.yacy.data.WorkTables; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; /** * Handle creation and edition of vocabularies through the Vocabulary_p.html page. */ public class Vocabulary_p { /** Logger */ private final static ConcurrentLog LOG = new ConcurrentLog(Vocabulary_p.class.getSimpleName()); public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); /* Acquire a transaction token for the next POST form submission */ final String nextToken = TransactionManager.getTransactionToken(header); prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, nextToken); prop.put("edit_" + TransactionManager.TRANSACTION_TOKEN_PARAM, nextToken); prop.put("create_" + TransactionManager.TRANSACTION_TOKEN_PARAM, nextToken); Collection vocs = LibraryProvider.autotagging.getVocabularies(); String vocabularyName = (post == null) ? null : post.get("vocabulary", null); String discovername = (post == null) ? null : post.get("discovername", null); Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName); if (vocabulary == null) { vocabularyName = null; } if (post != null) { // create a vocabulary if (vocabulary == null && discovername != null && discovername.length() > 0) { /* Check the transaction is valid */ TransactionManager.checkPostTransaction(header, post); // get details of creation String discoverobjectspace = post.get("discoverobjectspace", ""); MultiProtocolURL discoveruri = null; if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURL(discoverobjectspace);} catch (final MalformedURLException e) {} if (discoveruri == null) discoverobjectspace = ""; final Map table = new LinkedHashMap(); final File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); final boolean discoverNot = post.get("discovermethod", "").equals("none"); final boolean discoverFromPath = post.get("discovermethod", "").equals("path"); final boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); final boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " "); final Segment segment = sb.index; String t; int csvFileStatus = 0; if (!discoverNot) { if (discoverFromCSV) { if(discoverFromCSVPath.isEmpty()) { csvFileStatus = 1; } else { DigestURL csvUrl = null; if(discoverFromCSVPath.contains("://")) { try { csvUrl = new DigestURL(discoverFromCSVPath); } catch(final MalformedURLException e) { csvFileStatus = 5; prop.put("create_csvFileStatus_csvUrl", discoverFromCSVPath); } } else { final File discoverFromCSVFile = new File(discoverFromCSVPath); final String csvPath = discoverFromCSVFile.getAbsolutePath(); if (!discoverFromCSVFile.exists()) { csvFileStatus = 2; prop.put("create_csvFileStatus_csvPath", csvPath); } else if (!discoverFromCSVFile.canRead()) { csvFileStatus = 3; prop.put("create_csvFileStatus_csvFile", csvPath); } else if (discoverFromCSVFile.isDirectory()) { csvFileStatus = 4; prop.put("create_csvFileStatus_csvPath", csvPath); } else { try { csvUrl = new DigestURL(discoverFromCSVFile); } catch(final MalformedURLException e) { csvFileStatus = 5; prop.put("create_csvFileStatus_csvUrl", "file://" + discoverFromCSVFile.getAbsolutePath()); } } } if(csvUrl != null) { try { handleDiscoverFromCSV(sb, post, table, csvUrl); } catch(final IOException e) { LOG.warn("Could not read CSV file at " + csvUrl, e); csvFileStatus = 3; prop.put("create_csvFileStatus_csvFile", csvUrl.toString()); } } } } else { Iterator ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000); while (ui.hasNext()) { DigestURL u = ui.next(); String u0 = u.toNormalform(true); t = ""; if (discoverFromPath) { int exp = u0.lastIndexOf('.'); if (exp < 0) continue; int slp = u0.lastIndexOf('/', exp); if (slp < 0) continue; t = u0.substring(slp, exp); int p; while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); } if (discoverFromTitle || discoverFromTitleSplitted) { URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); if (m != null) t = m.dc_title(); if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; } if (discoverFromAuthor) { URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); if (m != null) t = m.dc_creator(); } t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); if (t.isEmpty()) continue; if (discoverFromTitleSplitted) { String[] ts = CommonPattern.SPACES.split(t); for (String s: ts) { if (s.isEmpty()) continue; if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } } else if (discoverFromAuthor) { String[] ts = CommonPattern.SEMICOLON.split(t); // author names are often separated by ';' for (String s: ts) { if (s.isEmpty()) continue; int p = s.indexOf(','); // check if there is a reversed method to mention the name if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } } else { table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); } } } } prop.put("create_csvFileStatus", csvFileStatus); if(csvFileStatus == 0) { try { Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); prop.put("create_vocabWriteError", false); LibraryProvider.autotagging.addVocabulary(newvoc); vocabularyName = discovername; vocabulary = newvoc; // store this call as api call sb.tables.recordAPICall(post, "Vocabulary_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "vocabulary creation for " + discovername); } catch(final IOException e) { prop.put("create_vocabWriteError", true); final String vocabPath = propFile.getAbsolutePath(); prop.put("create_vocabWriteError_vocabPath", vocabPath); LOG.severe("Could not write vocabulary file at " + vocabPath, e); } } } else if (vocabulary != null && post.containsKey("set")) { /* Check the transaction is valid */ TransactionManager.checkPostTransaction(header, post); try { // check if objectspace was set vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); // check if a term was added if (post.get("add_new", "").equals("checked") && post.get("newterm", "").length() > 0) { String objectlink = post.get("newobjectlink", ""); if (objectlink.length() > 0) try { objectlink = new MultiProtocolURL(objectlink).toNormalform(true); } catch (final MalformedURLException e) {} vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", ""), objectlink); } // check if a term was modified for (Map.Entry e : post.entrySet()) { if (e.getKey().startsWith("modify_") && e.getValue().equals("checked")) { String term = e.getKey().substring(7); String synonyms = post.get("synonyms_" + term, ""); String objectlink = post.get("objectlink_" + term, ""); vocabulary.put(term, synonyms, objectlink); } } // check if the vocabulary shall be cleared if (post.get("clear_table", "").equals("checked") ) { vocabulary.clear(); } // check if the vocabulary shall be deleted if (post.get("delete_vocabulary", "").equals("checked") ) { LibraryProvider.autotagging.deleteVocabulary(vocabularyName); vocabulary = null; vocabularyName = null; } // check if a term shall be deleted if (vocabulary != null && vocabulary.size() > 0) for (Map.Entry e : post.entrySet()) { if (e.getKey().startsWith("delete_") && e.getValue().equals("checked")) { String term = e.getKey().substring(7); vocabulary.delete(term); } } // check the isFacet and isMatchFromLinkedData properties if (vocabulary != null && post.containsKey("set")) { boolean isFacet = post.getBoolean("isFacet"); vocabulary.setFacet(isFacet); Set omit = env.getConfigSet("search.result.show.vocabulary.omit"); if (isFacet) { omit.remove(vocabularyName); } else { omit.add(vocabularyName); } env.setConfig("search.result.show.vocabulary.omit", omit); boolean isMatchFromLinkedData = post.getBoolean("vocabularies.matchLinkedData"); vocabulary.setMatchFromLinkedData(isMatchFromLinkedData); final Set matchLinkedDataVocs = env.getConfigSet(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES); if (isMatchFromLinkedData) { matchLinkedDataVocs.add(vocabularyName); } else { matchLinkedDataVocs.remove(vocabularyName); } env.setConfig(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES, matchLinkedDataVocs); } } catch (final IOException e) { ConcurrentLog.logException(e); } } } int count = 0; for (Tagging v: vocs) { prop.put("vocabularyset_" + count + "_name", v.getName()); prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0); count++; } prop.put("vocabularyset", count); prop.put("create", vocabularyName == null ? 1 : 0); if (vocabulary == null) { prop.put("edit", 0); } else { prop.put("edit", 1); boolean editable = vocabulary.getFile() != null && vocabulary.getFile().exists(); prop.put("edit_editable", editable ? 1 : 0); prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : ""); prop.putHTML("edit_name", vocabulary.getName()); prop.putXML("edit_namexml", vocabulary.getName()); prop.putHTML("edit_namespace", vocabulary.getNamespace()); prop.put("edit_isFacet", vocabulary.isFacet() ? 1 : 0); prop.put("edit_vocabularies.matchLinkedData", vocabulary.isMatchFromLinkedData()); prop.put("edit_size", vocabulary.size()); prop.putHTML("edit_predicate", vocabulary.getPredicate()); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()); prop.putHTML("edit_editable_objectspacepredicate", DCTerms.references.getPredicate()); int c = 0; boolean dark = false; int osl = vocabulary.getObjectspace() == null ? 0 : vocabulary.getObjectspace().length(); Map list = vocabulary.list(); prop.put("edit_size", list.size()); for (Map.Entry entry: list.entrySet()) { prop.put("edit_terms_" + c + "_editable", editable ? 1 : 0); prop.put("edit_terms_" + c + "_dark", dark ? 1 : 0); dark = !dark; prop.putXML("edit_terms_" + c + "_label", osl > entry.getValue().getObjectlink().length() ? entry.getKey() : entry.getValue().getObjectlink().substring(osl)); prop.putHTML("edit_terms_" + c + "_term", entry.getKey()); prop.putXML("edit_terms_" + c + "_termxml", entry.getKey()); prop.putHTML("edit_terms_" + c + "_editable_term", entry.getKey()); String synonymss = entry.getValue().getSynonymsCSV(); prop.putHTML("edit_terms_" + c + "_editable_synonyms", synonymss); if (synonymss.length() > 0) { String[] synonymsa = entry.getValue().getSynonymsList(); for (int i = 0; i < synonymsa.length; i++) { prop.put("edit_terms_" + c + "_synonyms_" + i + "_altLabel", synonymsa[i]); } prop.put("edit_terms_" + c + "_synonyms", synonymsa.length); } else { prop.put("edit_terms_" + c + "_synonyms", 0); } prop.putXML("edit_terms_" + c + "_editable_objectlink", entry.getValue().getObjectlink()); c++; if (c > 3000) break; } prop.put("edit_terms", c); } // make charset list for import method selector prop.putHTML("create_charset_" + 0 + "_name", "autodetect"); prop.put("create_charset_" + 0 + "_selected", 1); int c = 1; for (String cs: Charset.availableCharsets().keySet()) { prop.putHTML("create_charset_" + c + "_name", cs); prop.put("create_charset_" + c + "_selected", 0); c++; } prop.put("create_charset", c); // return rewrite properties return prop; } /** * Parse a CSV content line and extract field values. When the last field of * this line starts with and unclosed escape character, the current line is * appended to the escapedContent buffer. * * @param line * a raw line from a CSV document. Must not be null. * @param separatorPattern * the field separator character compiled as a Pattern instance. Must * not be null. * @param escape * escape character * @param multiLineContent * eventually holds content of previous lines whose last field * includes an escaped line separator * @return the list of field values extracted from the line, eventually empty. */ private static List parseCSVLine(final String line, final Pattern separatorPattern, final String escape, final StringBuilder multiLineContent) { final List fields = new ArrayList<>(); String[] tokens = separatorPattern.split(line); if (tokens.length == 0) { tokens = new String[]{line}; } /* Handle continuation of multi-lines field content escaped between escape char */ if(multiLineContent.length() > 0) { int closingEscapeIndex = -1; for(int index = 0; index < tokens.length; index++) { if(tokens[index].endsWith(escape)) { closingEscapeIndex = index; break; } } if(closingEscapeIndex >= 0) { /* End of multi-line escape */ multiLineContent.append("\n").append(line); tokens = separatorPattern.split(multiLineContent.toString()); multiLineContent.setLength(0); if (tokens.length == 0) { tokens = new String[]{line}; } } else { /* Multi-line escape continue */ multiLineContent.append("\n").append(line); return fields; } } /* Handle separator char escaped between escape char */ final StringBuilder escapedSeparatorContent = new StringBuilder(); for(final String field : tokens) { if(escapedSeparatorContent.length() == 0) { if(field.startsWith(escape) && !field.endsWith(escape)) { /* Beginning of escape */ escapedSeparatorContent.append(field).append(separatorPattern.toString()); continue; } } else if(field.endsWith(escape)) { /* End of field escape */ escapedSeparatorContent.append(field); fields.add(escapedSeparatorContent.toString()); escapedSeparatorContent.setLength(0); continue; } else { /* Escape continue */ escapedSeparatorContent.append(field).append(separatorPattern.toString()); continue; } fields.add(field); } if(escapedSeparatorContent.length() > 0) { /* Handle beginning of field content with escaped line separator */ multiLineContent.setLength(0); multiLineContent.append(line); } return fields; } /** * Fill the vocabulary table from a CSV file. * @param sb the main Switchbaord instance. Must not be null. * @param post current request parameters. Must not be null. * @param table the vocabulary table to fill. Must not be null. * @param csvFileUrl the file URL. Must not be null. * @throws IOException when a read/write error occurred * @throws UnsupportedEncodingException * @throws FileNotFoundException when the file does not exists or can not be read for some reason. */ protected static void handleDiscoverFromCSV(final Switchboard sb, final serverObjects post, final Map table, final DigestURL csvFileUrl) throws IOException, UnsupportedEncodingException, FileNotFoundException { String charsetName = post.get("charset", StandardCharsets.UTF_8.name()); final String columnSeparator = post.get("columnSeparator", ";"); final String escapeChar = "\""; final int lineStart = post.getInt("discoverLineStart", 0); final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1); final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final boolean discoverenrichsynonyms = post.get("discoversynonymsmethod", "none").equals("enrichsynonyms"); final boolean discoverreadcolumn = post.get("discoversynonymsmethod", "none").equals("readcolumn"); final Pattern separatorPattern = Pattern.compile(columnSeparator); // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html if (charsetName.equals("autodetect")) { try (final StreamResponse streamResponse = sb.loader.openInputStream( sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) { if(streamResponse == null || streamResponse.getContentStream() == null) { throw new IOException("Could not get CSV content at " + csvFileUrl); } charsetName = streamResponse.getResponse().getCharacterEncoding(); if(charsetName == null) { /* Charset not provided in response headers : try to detect it from content */ final List charsets = FileUtils.detectCharset(streamResponse.getContentStream()); charsetName = charsets.get(0); LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString()); } else { LOG.info("detected charset: " + charsetName + " used to read " + csvFileUrl.toString()); /* Use now the open stream */ try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName); final BufferedReader bufferedReader = new BufferedReader(reader);) { discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms, discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, bufferedReader); } return; } } } // when autodetection of content charset has been selected, a remote resource may opened again, but has some chances to be now in cache try(final StreamResponse streamResponse = sb.loader.openInputStream( sb.loader.request(csvFileUrl, true, false), CacheStrategy.IFFRESH, BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent, Integer.MAX_VALUE);) { if(streamResponse == null || streamResponse.getContentStream() == null) { throw new IOException("Could not get CSV content at " + csvFileUrl); } try (final InputStreamReader reader = new InputStreamReader(streamResponse.getContentStream(), charsetName); final BufferedReader bufferedReader = new BufferedReader(reader);) { discoverFromCSVReader(table, escapeChar, lineStart, discovercolumnliteral, discovercolumnsynonyms, discovercolumnobjectlink, discoverenrichsynonyms, discoverreadcolumn, separatorPattern, bufferedReader); } } } /** * Fill the vocabulary table from reader open on CSV content. * @param table the vocabulary table to fill. Must not be null. * @param escapeChar CSV escape character (standard is double quote). Must not be null. * @param lineStart index (zero based) of the first line to parse. Previous lines are ignored. * @param literalsColumn index (zero based) of the column to read for literals * @param synonymsColumn index (zero based) of the column to read for synonyms. Set to -1 to ignore. * @param discovercolumnobjectlink * @param discoverenrichsynonyms * @param readSynonymFromColumn when true synonym terms are read from the column at synonymsColumn index * @param separatorPattern the field separator character compiled as a Pattern instance. Must not be null. * @param reader an open reader on CSV content. Must not be null. * @throws IOException when an read error occurred */ protected static void discoverFromCSVReader(final Map table, final String escapeChar, final int lineStart, final int literalsColumn, final int synonymsColumn, final int discovercolumnobjectlink, final boolean discoverenrichsynonyms, final boolean readSynonymFromColumn, final Pattern separatorPattern, final BufferedReader reader) throws IOException { String line = null; final StringBuilder multiLineContent = new StringBuilder(); final Map synonym2literal = new HashMap<>(); // helper map to check if there are double synonyms int lineIndex = -1; while ((line = reader.readLine()) != null) { lineIndex++; if(lineIndex < lineStart) { continue; } if (line.length() == 0) { continue; } final List fields = parseCSVLine(line, separatorPattern, escapeChar, multiLineContent); if (multiLineContent.length() > 0) { continue; } String literal = literalsColumn < 0 || fields.size() <= literalsColumn ? null : fields.get(literalsColumn).trim(); if (literal == null) { continue; } literal = normalizeLiteral(literal, escapeChar); final String objectlink = discovercolumnobjectlink < 0 || fields.size() <= discovercolumnobjectlink ? null : fields.get(discovercolumnobjectlink).trim(); if (literal.length() > 0) { String synonyms = ""; if (discoverenrichsynonyms) { final Set sy = SynonymLibrary.getSynonyms(literal); if (sy != null) { for (final String s: sy) { synonyms += "," + s; } } } else if (readSynonymFromColumn) { synonyms = synonymsColumn < 0 || fields.size() <= synonymsColumn ? null : fields.get(synonymsColumn).trim(); synonyms = normalizeLiteral(synonyms, escapeChar); } else { synonyms = Tagging.normalizeTerm(literal); } // check double synonyms if (synonyms.length() > 0) { String oldliteral = synonym2literal.get(synonyms); if (oldliteral != null && !literal.equals(oldliteral)) { // replace old entry with combined new table.remove(oldliteral); String newliteral = oldliteral + "," + literal; literal = newliteral; } synonym2literal.put(synonyms, literal); } // store term table.put(literal, new Tagging.SOTuple(synonyms, objectlink == null ? "" : objectlink)); } } } private static String normalizeLiteral(String literal, final String escapeChar) { if (literal == null) { return ""; } if(literal.length() > 1 && literal.startsWith(escapeChar) && literal.endsWith(escapeChar)) { literal = literal.replace("\"\"", "\""); } if (literal.length() > 0 && (literal.charAt(0) == '"' || literal.charAt(0) == '\'')) { literal = literal.substring(1); } if (literal.length() > 0 && (literal.charAt(literal.length() - 1) == '"' || literal.charAt(literal.length() - 1) == '\'')) { literal = literal.substring(0, literal.length() - 1); } return literal; } }