From ab5a9bee66feceb62577bc05b4bb11c3584b8223 Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 4 Sep 2006 14:38:29 +0000 Subject: [PATCH] *) adding some copyright headers *) next step of restructuring for new crawlers - adding first testversion of ftp crawler class -- does not create a htCache entry yet git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2483 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/net/ftpc.java | 2 +- .../plasma/crawler/AbstractCrawlWorker.java | 47 ++++++ .../plasma/crawler/ftp/CrawlWorker.java | 141 ++++++++++++++++++ .../plasma/crawler/http/CrawlWorker.java | 2 +- .../plasma/crawler/plasmaCrawlWorker.java | 47 ++++++ .../plasma/crawler/plasmaCrawlerFactory.java | 46 ++++++ .../plasma/crawler/plasmaCrawlerMsgQueue.java | 2 +- .../plasma/crawler/plasmaCrawlerPool.java | 47 ++++++ .../de/anomic/plasma/plasmaCrawlLoader.java | 2 +- source/de/anomic/plasma/plasmaHTCache.java | 15 +- .../de/anomic/plasma/plasmaSwitchboard.java | 13 +- 11 files changed, 351 insertions(+), 13 deletions(-) create mode 100644 source/de/anomic/plasma/crawler/ftp/CrawlWorker.java diff --git a/source/de/anomic/net/ftpc.java b/source/de/anomic/net/ftpc.java index da133a869..c6a880ff7 100644 --- a/source/de/anomic/net/ftpc.java +++ b/source/de/anomic/net/ftpc.java @@ -597,7 +597,7 @@ public class ftpc { } } - private boolean isFolder(String path) { + public boolean isFolder(String path) { try { send("CWD " + path); String reply = receive(); diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 7001a51bb..f46b3cfae 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -1,3 +1,50 @@ +// AbstractCrawlWorker.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// 
+// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. 
A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
 package de.anomic.plasma.crawler;
 
 import java.io.IOException;
diff --git a/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java
new file mode 100644
index 000000000..0af2f5734
--- /dev/null
+++ b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java
@@ -0,0 +1,171 @@
+// CrawlerWorker.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2006
+//
+// This file ist contributed by Martin Thelian
+//
+// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
+// $LastChangedRevision: 1715 $
+// $LastChangedBy: theli $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma.crawler.ftp;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import de.anomic.net.ftpc;
+import de.anomic.plasma.plasmaHTCache;
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.crawler.AbstractCrawlWorker;
+import de.anomic.plasma.crawler.plasmaCrawlWorker;
+import de.anomic.plasma.crawler.plasmaCrawlerPool;
+import de.anomic.plasma.plasmaHTCache.Entry;
+import de.anomic.server.logging.serverLog;
+
+/**
+ * Crawl worker for the ftp protocol.
+ *
+ * First test version: {@link #load()} transfers the remote file to the cache
+ * path supplied by the cache manager, but does not create a htCache entry yet.
+ */
+public class CrawlWorker extends AbstractCrawlWorker implements
+        plasmaCrawlWorker {
+
+    /** login value used for a user name or password the url does not provide */
+    private static final String ANONYMOUS = "anonymous";
+
+    /**
+     * Creates a worker that registers itself for the ftp protocol.
+     * All arguments are passed on unchanged to the super class constructor.
+     */
+    public CrawlWorker(ThreadGroup theTG, plasmaCrawlerPool thePool, plasmaSwitchboard theSb, plasmaHTCache theCacheManager, serverLog theLog) {
+        super(theTG, thePool, theSb, theCacheManager, theLog);
+
+        // this crawler supports ftp
+        this.protocol = "ftp";
+    }
+
+    /** Nothing to release yet: the ftp session only lives inside {@link #load()}. */
+    public void close() {
+    }
+
+    /** Nothing to prepare yet: the ftp session is opened by each {@link #load()} call. */
+    public void init() {
+    }
+
+    /**
+     * Fetches the resource addressed by this.url via ftp.
+     *
+     * Logs in with the user info of the url (missing parts are replaced by
+     * "anonymous"), changes into the parent directory of the resource and -
+     * unless the resource is a folder - downloads it to the cache path
+     * supplied by the cache manager.
+     *
+     * @return always null for now, no htCache entry is created yet
+     * @throws IOException declared by the signature; which of the called
+     *         helpers may raise it is not visible in this class
+     */
+    public Entry load() throws IOException {
+
+        File cacheFile = cacheManager.getCachePath(url);
+        cacheFile.getParentFile().mkdirs();
+
+        // buffers handed to the ftp client as its output and error streams
+        ByteArrayOutputStream bout = new ByteArrayOutputStream();
+        PrintStream out = new PrintStream(bout);
+
+        ByteArrayOutputStream berr = new ByteArrayOutputStream();
+        PrintStream err = new PrintStream(berr);
+
+        ftpc ftpClient = new ftpc(System.in, out, err);
+
+        // credentials: "user:password", "user" only, or anonymous login
+        String userName = ANONYMOUS;
+        String userPwd = ANONYMOUS;
+        String userInfo = this.url.getUserInfo();
+        if (userInfo != null) {
+            int sep = userInfo.indexOf(":");
+            if (sep == -1) {
+                // no password part: substring(0,-1) would throw here
+                userName = userInfo;
+            } else {
+                userName = userInfo.substring(0,sep);
+                userPwd = userInfo.substring(sep+1);
+            }
+        }
+
+        // split the url path into parent directory and resource name
+        String urlPath = this.url.getPath();
+        int pos = urlPath.lastIndexOf("/");
+        // pos == -1 yields the whole path as resource name
+        String file = urlPath.substring(pos+1);
+        // a resource directly below the root must not lead to an empty cd target
+        String path = (pos <= 0) ? "/" : urlPath.substring(0,pos);
+
+        try {
+            // NOTE(review): a port given in the url is not passed on to "open"
+            ftpClient.exec("open " + this.url.getHost(), false);
+            ftpClient.exec("user " + userName + " " + userPwd, false);
+            ftpClient.exec("binary", false);
+
+            ftpClient.exec("cd \"" + path + "\"", false);
+
+            if (ftpClient.isFolder(file)) {
+                ftpClient.exec("cd \"" + file + "\"", false);
+
+                // TODO: dirlist
+            } else {
+                // download the remote file
+                ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false);
+            }
+        } finally {
+            // always end the session, even if a command above fails
+            ftpClient.exec("close", false);
+            ftpClient.exec("exit", false);
+        }
+
+        // TODO: create a new htCache entry ....
+
+        return null;
+    }
+
+}
diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
index 07a195d72..7fe642b65 100644
--- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java
@@ -3,7 +3,7 @@
 //part of YaCy
 //(C) by Michael Peter Christen; mc@anomic.de
 //first published on http://www.anomic.de
-//Frankfurt, Germany, 2004
+//Frankfurt, Germany, 2006
 //
 // $LastChangedDate: 2006-08-12 16:28:14 +0200 (Sa, 12 Aug 2006) $
 // $LastChangedRevision: 2397 $
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java b/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java
index 349d3d7ea..0eb0f0d79 100644
--- a/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlWorker.java
@@ -1,3 +1,50 @@
+// plasmaCrawlWorker.java
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2006
+//
+// This file ist contributed by Martin Thelian
+//
+// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
+// $LastChangedRevision: 1715 $
+// $LastChangedBy: theli $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ + package de.anomic.plasma.crawler; import java.io.IOException; diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java index 549cf8a87..7f4f229ee 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java @@ -1,3 +1,49 @@ +// plasmaCrawlerFactory.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// +// This file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + package de.anomic.plasma.crawler; import java.lang.reflect.Constructor; diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java b/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java index 2743455c1..b263835ef 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerMsgQueue.java @@ -3,7 +3,7 @@ // part of YACY // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de -// Frankfurt, Germany, 2004 +// Frankfurt, Germany, 2006 // // This file ist contributed by Martin Thelian // diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java index 11accfae8..f51cd25ff 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java @@ -1,3 +1,50 @@ +// plasmaCrawlerPool.java +// ------------------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2006 +// +// This 
file ist contributed by Martin Thelian +// +// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ +// $LastChangedRevision: 1715 $ +// $LastChangedBy: theli $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. 
A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + + package de.anomic.plasma.crawler; import org.apache.commons.pool.impl.GenericKeyedObjectPool; diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index 8601cb646..d8b8fdca7 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -83,7 +83,7 @@ public final class plasmaCrawlLoader extends Thread { // supported protocols // TODO: change this, e.g. by loading settings from file - this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"})); + this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https" /* ,"ftp" */})); // configuring the crawler messagequeue this.theQueue = new plasmaCrawlerMsgQueue(); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index a1ed35f15..8fe6b0693 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -733,11 +733,16 @@ public final class plasmaHTCache { ); } - public Entry(Date initDate, int depth, URL url, String name, - httpHeader requestHeader, - String responseStatus, httpHeader responseHeader, - String initiator, - plasmaCrawlProfile.entry profile) { + public Entry(Date initDate, + int depth, + URL url, + String name, + httpHeader requestHeader, + String responseStatus, + httpHeader responseHeader, + String initiator, + plasmaCrawlProfile.entry profile + ) { // normalize url // serverLog.logFine("PLASMA", "Entry: URL=" + url.toString()); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index bee35bae3..ae8f929c1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -893,10 +893,15 @@ public final 
class plasmaSwitchboard extends serverAbstractSwitch implements ser } // enqueue for further crawling - enQueue(this.sbQueue.newEntry(entry.url, indexURL.urlHash(entry.referrerURL()), - entry.requestHeader.ifModifiedSince(), entry.requestHeader.containsKey(httpHeader.COOKIE), - entry.initiator(), entry.depth, entry.profile.handle(), - entry.name() + enQueue(this.sbQueue.newEntry( + entry.url, + indexURL.urlHash(entry.referrerURL()), + entry.requestHeader.ifModifiedSince(), + entry.requestHeader.containsKey(httpHeader.COOKIE), + entry.initiator(), + entry.depth, + entry.profile.handle(), + entry.name() )); } else { if (!entry.profile.storeHTCache() && entry.cacheFile.exists()) {