1.日志log4j
1.1.DEBUG:debug级别
1.2.stdout:输出到控制台
1.3.D:输出到文件
log4j.rootLogger=DEBUG, stdout,D 
  
#Console  
log4j.appender.stdout=org.apache.log4j.ConsoleAppender  
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout  
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
#D
log4j.appender.D = org.apache.log4j.DailyRollingFileAppender
log4j.appender.D.File = C://logs2/log.log
log4j.appender.D.Append = true
log4j.appender.D.Threshold = DEBUG 
log4j.appender.D.layout = org.apache.log4j.PatternLayout
log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss}  [ %t:%r ] - [ %p ]  %m%n
2.初始化url
package com.open111.crawler;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * 
 * 爬虫起始类
 * @author user
 *
 */
public class StartCrawler {
	
	private static Logger logger=Logger.getLogger(StartCrawler.class);
	public static String[] excludeUrl=new String[]{ ".pom", ".xml", ".md5", ".sha1", ".asc", ".gz", ".zip", "../"}; // 要过滤的url后缀
	
	public static Queue<String> waitForCrawlerUrls=new LinkedList<String>(); // 等待爬取的url
	
	private static int total=0;
	
	/**
	 * 解析网页内容
	 * @param webPageContent
	 */
	public static void parseWebPage(String webPageContent,String realPath){
		if("".equals(webPageContent)){
			return;
		}
		Document doc=Jsoup.parse(webPageContent);
		Elements links=doc.select("a"); // 获取所有超链接元素
		for(int i=0;i<links.size();i++){
			Element link=links.get(i);
			String url=link.attr("href");
			boolean f=true;
			for(int j=0;j<excludeUrl.length;j++){
				if(url.endsWith(excludeUrl[j])){
					f=false;
					break;
				}
			}
			if(f){ // 是我们需要的url
				if(url.endsWith(".jar")){ // 目标地址
					total++;
					logger.info("发现第"+total+"个目标:"+(realPath+url));
				}else{ // 要继续解析的Url 
					logger.info("爬虫url队列新增url:"+(realPath+url));
					addUrl(realPath+url,"解析网页");
				}
			}
		}
	}
	
	/**
	 * 添加url到爬虫队列,假如队列中存在 就不添加
	 * @param string
	 */
	private static void addUrl(String url,String info) {
		if(url==null || "".equals(url)){
			return;
		}
		if(!waitForCrawlerUrls.contains(url)){
			waitForCrawlerUrls.add(url);
			logger.info("["+info+"]"+url+"添加到爬虫队列");
		}
	}
	/**
	 * 解析网页请求
	 * @param url 请求的url
	 */
	public static void parseUrl(){
		while(waitForCrawlerUrls.size()>0){
			String url=waitForCrawlerUrls.poll(); // 摘取第一个元素
			logger.info("执行解析url:"+url);
			CloseableHttpClient httpClient=HttpClients.createDefault(); // 创建httpclient实例
			HttpGet httpGet=new HttpGet(url); // 创建httpget实例
			CloseableHttpResponse response=null; 
			try {
				response=httpClient.execute(httpGet);
				HttpEntity entity=response.getEntity(); // 获取返回实体
				if("text/html".equals(entity.getContentType().getValue())){
					String webPageContent=EntityUtils.toString(entity, "utf-8");
					parseWebPage(webPageContent,url);
				}
			} catch (ClientProtocolException e) {
				logger.error("ClientProtocolException", e);
				addUrl(url,"由于异常");
			} catch (IOException e) {
				logger.error("IOException", e);
				addUrl(url,"由于异常");
			}finally{
				if(response!=null){
					try {
						response.close();
					} catch (IOException e) {
						logger.error("IOException", e);
					}
				}
				try {
					httpClient.close();
				} catch (IOException e) {
					logger.error("IOException", e);
				}
			}
			
			try {
				Thread.sleep(1000); // 休息1秒钟
			} catch (InterruptedException e) {
				logger.error("InterruptedException", e);
			} 
		}
		
	}
	
	private static void init(){
		logger.info("读取爬虫配置文件");
		FileInputStream fis=null;
		InputStreamReader isr=null;
		BufferedReader br=null;
		try {
			String str=null;
			fis=new FileInputStream("c:\\crawler.txt");
			isr=new InputStreamReader(fis);
			br=new BufferedReader(isr);
			while((str=br.readLine())!=null){
				addUrl(str, "初始化");
			}
		} catch (FileNotFoundException e) {
			logger.error("FileNotFoundException", e);
		} catch (IOException e) {
			logger.error("IOException", e);
		}finally{
			try {
				br.close();
				isr.close();
				fis.close();
			} catch (IOException e) {
				logger.error("IOException", e);
			}
		}
		logger.info("完成读取爬虫配置文件");
		parseUrl();
	}
	
	public static void main(String[] args) {
		logger.info("开始执行爬虫任务");
		init();
	}
}
原文:http://www.cnblogs.com/csy666/p/6597261.html