1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
|
/**
 * Static helpers for mirroring a web site to disk: URL classification, mapping a URL to a
 * local file path, downloading a single file, and an entry point that crawls a site and then
 * downloads every file link the page processor collected.
 */
public class HtmlFileUtil {
    private static final String separator = File.separator;
    private static final Logger logger = LoggerFactory.getLogger(HtmlFileUtil.class);

    /** Total download attempts made for one URL while the connection keeps timing out. */
    private static final int MAX_CONNECT_ATTEMPTS = 5;

    /** Pause between two attempts after a connect timeout, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /** Action that consumes a resource and may fail with an {@link IOException}. */
    private interface IoAction<T> {
        void run(T resource) throws IOException;
    }

    /**
     * Tells whether a path looks like a static HTML page.
     *
     * @param path URL or path to inspect
     * @return {@code true} when the path ends with {@code /}, {@code .htm} or {@code .html}
     */
    public static boolean isHtml(String path) {
        return path.endsWith("/") || path.endsWith(".htm") || path.endsWith(".html");
    }

    /**
     * Tells whether a path looks like a server-side page.
     *
     * @param path URL or path to inspect
     * @return {@code true} when the path ends with {@code .php}, {@code .asp}, {@code .jsp},
     *         {@code .action} or {@code .do}
     */
    public static boolean isServer(String path) {
        return path.endsWith(".php")
                || path.endsWith(".asp")
                || path.endsWith(".jsp")
                || path.endsWith(".action")
                || path.endsWith(".do");
    }

    /**
     * Strips a leading {@code http://} or {@code https://} from the URL and collapses every
     * remaining {@code //} into a single {@code /}.
     *
     * @param url URL to normalise
     * @return the URL without scheme, usable as a relative path
     */
    public static String processUrl(String url) {
        return url.replaceFirst("^(http|https)://", "")
                .replaceAll("//", "/");
    }

    /**
     * Downloads {@code url} into a file below {@code basePath} unless that file already exists.
     *
     * <p>When the scheme-less URL starts with {@code siteName} the file is stored at
     * {@code basePath/<url>}; otherwise it is stored below {@code basePath/siteName/file} with
     * the first path segment of the URL removed.
     *
     * <p>A connect timeout is retried after a short pause, up to
     * {@link #MAX_CONNECT_ATTEMPTS} attempts in total; any other I/O failure is logged and the
     * download is abandoned. A failed download never leaves a partial file behind, so a later
     * call can fetch it again. Targets that would resolve outside {@code basePath} are skipped.
     *
     * @param basePath directory that receives the mirrored files
     * @param siteName host part of the site being mirrored (no scheme)
     * @param url      absolute URL to download
     * @throws RuntimeException if the target directory cannot be created
     */
    public static void saveFile(String basePath, String siteName, String url) {
        String processUrl = processUrl(url);
        String filePath = basePath + "/" + processUrl;
        if (!processUrl.startsWith(siteName)) {
            filePath = basePath + "/" + siteName + "/file" + processUrl.replaceFirst("[^/]+/", "/");
        }

        // Literal replace: replaceAll would read a Windows "\" separator as a broken escape
        // sequence in the replacement string and throw.
        File file = new File(filePath.replace("/", separator));
        logger.info("{}: {} -> {}", Thread.currentThread(), url, file.getAbsolutePath());
        if (file.exists()) {
            return;
        }

        try {
            // URLs come from crawled pages, so ".." segments must not escape the mirror root.
            File root = new File((basePath + "/").replace("/", separator));
            if (!isWithin(root, file)) {
                logger.warn("skip {}: target {} is outside of {}", url, file, basePath);
                return;
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
            return;
        }

        File dirs = new File(filePath.replaceFirst("/[^/]+$", "/").replace("/", separator));
        // mkdirs() also returns false when another worker created the directory concurrently,
        // so only fail when the directory is really missing afterwards.
        if (!dirs.mkdirs() && !dirs.isDirectory()) {
            throw new RuntimeException("make dir error");
        }

        for (int attempt = 1; attempt <= MAX_CONNECT_ATTEMPTS; attempt++) {
            try {
                download(url, file);
                return;
            } catch (ConnectTimeoutException e) {
                logger.warn("connect timeout", e);
                if (attempt == MAX_CONNECT_ATTEMPTS) {
                    break;
                }
                try {
                    Thread.sleep(RETRY_DELAY_MILLIS);
                } catch (InterruptedException ie) {
                    logger.warn("interrupted error", ie);
                    // Keep the interrupt visible to the caller and stop retrying.
                    Thread.currentThread().interrupt();
                    return;
                }
            } catch (IOException e) {
                logger.warn("write file error", e);
                return;
            }
        }
        logger.warn("giving up on {} after {} connect timeouts", url, MAX_CONNECT_ATTEMPTS);
    }

    /**
     * Fetches {@code url} and writes the response body to {@code file}; the HTTP client and the
     * output stream are always closed, and the file is deleted again when the transfer fails.
     */
    private static void download(String url, File file) throws IOException {
        boolean complete = false;
        try (FileOutputStream out = new FileOutputStream(file)) {
            closeAfter(HttpClients.createDefault(),
                    client -> client.execute(new HttpGet(url)).getEntity().writeTo(out));
            complete = true;
        } finally {
            // Runs after the stream is closed; an empty or truncated file would otherwise make
            // the exists() check in saveFile skip this URL forever.
            if (!complete && file.exists() && !file.delete()) {
                logger.warn("could not remove partial file {}", file);
            }
        }
    }

    /**
     * Runs {@code action} on {@code resource} and closes the resource afterwards. Generic so the
     * concrete resource type is inferred at the call site.
     */
    private static <T extends java.io.Closeable> void closeAfter(T resource, IoAction<T> action)
            throws IOException {
        try (T closing = resource) {
            action.run(closing);
        }
    }

    /** Tells whether the canonical path of {@code target} lies strictly below {@code root}. */
    private static boolean isWithin(File root, File target) throws IOException {
        String rootPath = root.getCanonicalPath();
        String prefix = rootPath.endsWith(separator) ? rootPath : rootPath + separator;
        return target.getCanonicalPath().startsWith(prefix);
    }

    /**
     * Mirrors a site without a proxy.
     *
     * @param siteName site URL including scheme, e.g. {@code http://example.com}
     * @param basePath directory that receives the mirrored files
     * @param thread   number of threads used for crawling and for downloading
     */
    public static void startGet(String siteName, String basePath, int thread) {
        startGet(siteName, basePath, thread, null, 0);
    }

    /**
     * Mirrors a site, optionally through a proxy. The site URL is split at {@code //} into
     * scheme and host before delegating.
     *
     * @param siteName  site URL including scheme, e.g. {@code http://example.com}
     * @param basePath  directory that receives the mirrored files
     * @param thread    number of threads used for crawling and for downloading
     * @param proxyAddr proxy host, or {@code null} for a direct connection
     * @param proxyPort proxy port; only used when {@code proxyAddr} is not {@code null}
     * @throws RuntimeException if {@code siteName} does not contain {@code //} followed by a host
     */
    public static void startGet(String siteName, String basePath, int thread, String proxyAddr, int proxyPort) {
        String[] split = siteName.split("//");
        if (split.length < 2) {
            throw new RuntimeException("site name error");
        }
        startGet(split[0] + "//", split[1], basePath, thread, proxyAddr, proxyPort);
    }

    /**
     * Crawls {@code scheme + siteName} with the page processor and, once the spider's
     * {@code run()} call has returned, submits one download task per collected file link to a
     * fixed thread pool.
     *
     * <p>The pool is shut down right after submission without awaiting termination, so this
     * method may return while downloads are still running.
     *
     * @param scheme    URL scheme including the trailing {@code //}, e.g. {@code http://}
     * @param siteName  host part of the site (no scheme)
     * @param basePath  directory that receives the mirrored files
     * @param thread    number of threads used for crawling and for downloading
     * @param proxyAddr proxy host, or {@code null} for a direct connection
     * @param proxyPort proxy port; only used when {@code proxyAddr} is not {@code null}
     */
    public static void startGet(String scheme, String siteName, String basePath, int thread, String proxyAddr, int proxyPort) {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        if (null != proxyAddr) {
            httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyAddr, proxyPort)));
        }

        HtmlPageProcessor processor = new HtmlPageProcessor(siteName);
        Spider.create(processor).addUrl(scheme + siteName)
                .addPipeline(new HtmlFilePipeline(basePath))
                .setDownloader(httpClientDownloader)
                .thread(thread).run();

        ExecutorService service = Executors.newFixedThreadPool(thread);
        try {
            for (String link : processor.FileLinks) {
                service.execute(() -> saveFile(basePath, siteName, toAbsoluteUrl(scheme, siteName, link)));
            }
        } finally {
            // Always release the pool, even when submitting a task fails.
            service.shutdown();
        }
    }

    /**
     * Turns a collected link into an absolute URL: links starting with {@code /} or {@code ./}
     * are resolved against the site root, everything else is returned unchanged.
     */
    private static String toAbsoluteUrl(String scheme, String siteName, String link) {
        if (link.startsWith("/")) {
            return scheme + siteName + link;
        }
        if (link.startsWith("./")) {
            // Drop the dot: appending "./x" directly would yield the host "site." instead of
            // the path "/x".
            return scheme + siteName + link.substring(1);
        }
        return link;
    }
}
|