mirror of
https://github.com/yacy/yacy_search_server.git
synced 2024-09-19 00:01:41 +02:00
Properly resolve relative URLs against document URL in html base tags
Fixes issue #256
This commit is contained in:
parent
73a6e45524
commit
3fb449b3b6
|
@ -744,9 +744,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
|
|||
} catch (final NumberFormatException e) {}
|
||||
this.evaluationScores.match(Element.imgpath, src);
|
||||
} else if(tag.name.equalsIgnoreCase("base")) {
|
||||
try {
|
||||
this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING));
|
||||
} catch (final MalformedURLException e) {}
|
||||
final String baseHref = tag.opts.getProperty("href", EMPTY_STRING);
|
||||
if(!baseHref.isEmpty()) {
|
||||
/* We must use here AnchorURL.newAnchor as the base href may also be an URL relative to the document URL */
|
||||
try {
|
||||
this.root = AnchorURL.newAnchor(this.root, baseHref);
|
||||
} catch (final MalformedURLException | RuntimeException ignored) {
|
||||
/* Nothing more to do when the base URL is malformed */
|
||||
}
|
||||
}
|
||||
} else if (tag.name.equalsIgnoreCase("frame")) {
|
||||
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
|
||||
if(src != null) {
|
||||
|
|
|
@ -342,6 +342,106 @@ public class ContentScraperTest {
|
|||
Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}", '{', '}'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test base tag URL resolution
|
||||
* @throws IOException when an unexpected error occurred
|
||||
*/
|
||||
@Test
|
||||
public void testBaseTagUrlResolution() throws IOException {
|
||||
final String htmlHeaderBeginning = "<!DOCTYPE html><head><title>Test document</title>";
|
||||
final DigestURL docUrl = new DigestURL("http://example.org/parent/base.html");
|
||||
|
||||
final String htmlLinksList = "<ul>"
|
||||
+ "<li><a href=\"http://example.org/sameDomain/absolute.html\">Absolute on same domain</a></li>"
|
||||
+ "<li><a href=\"http://localhost/otherDomain/absolute.html\">Absolute on another domain</a></li>"
|
||||
+ "<li><a href=\"//example.org/sameDomain/scheme-relative.html\">scheme-relative on same domain</a></li>"
|
||||
+ "<li><a href=\"//example.net/otherDomain/scheme-relative.html\">scheme-relative on another domain</a></li>"
|
||||
+ "<li><a href=\"/path/absolute.html\">path-absolute</a></li>"
|
||||
+ "<li><a href=\"path/relative/schemeless.html\">path-relative-scheme-less</a></li>"
|
||||
+ "</ul>";
|
||||
|
||||
|
||||
final Map<String, String[]> html2Results = new HashMap<>();
|
||||
/* No base tag */
|
||||
String html = htmlHeaderBeginning + "</head>" + htmlLinksList;
|
||||
String[] expectedUrls = { "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.org/path/absolute.html",
|
||||
"http://example.org/parent/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with absolute href on same domain */
|
||||
html = htmlHeaderBeginning + "<base href=\"http://example.org/base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.org/path/absolute.html",
|
||||
"http://example.org/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with absolute href on another domain */
|
||||
html = htmlHeaderBeginning + "<base href=\"http://example.net/base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.net/path/absolute.html",
|
||||
"http://example.net/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with scheme-relative href on same domain */
|
||||
html = htmlHeaderBeginning + "<base href=\"//example.org/base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.org/path/absolute.html",
|
||||
"http://example.org/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with scheme-relative href on another domain */
|
||||
html = htmlHeaderBeginning + "<base href=\"//example.net/base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.net/path/absolute.html",
|
||||
"http://example.net/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with path-absolute relative href */
|
||||
html = htmlHeaderBeginning + "<base href=\"/base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.org/path/absolute.html",
|
||||
"http://example.org/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
/* Base with path-relative-scheme-less relative href */
|
||||
html = htmlHeaderBeginning + "<base href=\"base/index.html\"/>"
|
||||
+ "</head>" + htmlLinksList;
|
||||
expectedUrls = new String[]{ "http://example.org/sameDomain/absolute.html",
|
||||
"http://localhost/otherDomain/absolute.html", "http://example.org/sameDomain/scheme-relative.html",
|
||||
"http://example.net/otherDomain/scheme-relative.html", "http://example.org/path/absolute.html",
|
||||
"http://example.org/parent/base/path/relative/schemeless.html" };
|
||||
html2Results.put(html, expectedUrls);
|
||||
|
||||
for (final Entry<String, String[]> html2Result : html2Results.entrySet()) {
|
||||
ContentScraper scraper = new ContentScraper(docUrl, 10, new HashSet<String>(), new VocabularyScraper(), 0);
|
||||
try (final Writer writer = new TransformerWriter(null, null, scraper, false)) {
|
||||
FileUtils.copy(new StringReader(html2Result.getKey()), writer);
|
||||
|
||||
final Set<DigestURL> expected = new HashSet<>();
|
||||
for (final String url : html2Result.getValue()) {
|
||||
expected.add(new DigestURL(url));
|
||||
}
|
||||
|
||||
Assert.assertEquals(expected.size(), scraper.getAnchors().size());
|
||||
Assert.assertTrue(expected.containsAll(scraper.getAnchors()));
|
||||
} finally {
|
||||
scraper.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test microdata itemtype attribute parsing
|
||||
* @throws IOException
|
||||
|
|
Loading…
Reference in New Issue
Block a user