Merge branch 'master' of git@gitorious.org:yacy/rc1.git

This commit is contained in:
orbiter 2014-04-23 23:13:23 +02:00
commit 2fd8a0ead6
3 changed files with 39 additions and 10 deletions

View File

@ -520,7 +520,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", new String(tag.content.getChars()));
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());

View File

@ -5,8 +5,6 @@ import java.io.IOException;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.junit.After;
import org.junit.Before;
@ -57,12 +55,8 @@ public class EmbeddedSolrConnectorTest {
System.out.println("query solr");
long expResult = 1;
SolrDocumentList result;
try {
result = solr.getDocumentListByQuery(CollectionSchema.text_t.name() + ":tempor", 0, 10,"");
assertEquals(expResult, result.getNumFound());
} catch (final IOException ex) {
fail("Solr query no result");
}
long result = solr.getCountByQuery(CollectionSchema.text_t.name() + ":tempor");
System.out.println("found = " + result + " (expected = 1 )");
assertEquals(expResult, result);
}
}

View File

@ -5,11 +5,15 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.List;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.ContentScraper;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import org.junit.Test;
public class htmlParserTest extends TestCase {
@ -80,4 +84,35 @@ public class htmlParserTest extends TestCase {
}
}
/**
 * Test of parseToScraper method, of class htmlParser.
 *
 * Parses a small HTML document whose links carry inline markup inside their
 * text and checks that the text property of each resulting anchor contains
 * only the plain text. The plain text matters because it is possibly indexed
 * in outboundlinks_anchortext_txt / inboundlinks_anchortext_txt.
 *
 * @throws Exception if creating the test URL or parsing the document fails
 */
@Test
public void testParseToScraper_4args() throws Exception {
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String mimetype = "text/html";
    // well-formed fixture: three links, each with inline html in the link text
    final String testhtml = "<html><body>"
            + "<a href='x1.html'><span>testtext</span></a>" // "testtext"
            + "<a href=\"http://localhost/x2.html\"> <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // "Start"
            + "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // "" + image
            + "</body></html>";

    final ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
    final List<AnchorURL> anchorlist = scraper.getAnchors();

    // guard the index access below so a missing anchor shows up as a readable
    // assertion failure instead of an IndexOutOfBoundsException
    assertTrue("expected at least 3 anchors but found " + anchorlist.size(), anchorlist.size() >= 3);

    // inline tags must be stripped, leaving only the visible link text
    assertEquals("testtext", anchorlist.get(0).getTextProperty());
    assertEquals("Start", anchorlist.get(1).getTextProperty());
    // a link that wraps only an image has no text at all
    assertEquals("", anchorlist.get(2).getTextProperty());

    // the image nested inside the third link is still expected to be collected
    assertEquals(1, scraper.getImages().size());
}
}