
Sharing a Simple Java Crawler Framework

2019-11-26 10:57:52

Writing different crawler logic for site after site gets tedious, so I implemented a small framework of my own.

The customizable parts are:

The request method (defaults to GET with a Chrome User-Agent); customize it by implementing the RequestSet interface.

The storage method (defaults to the html folder on the F drive); customize it through the SaveUtil interface.

The resources to save (defaults to the entire HTML page).

The filtering rules (by default every URL qualifies); implement the ResourseChooser interface to pick which URLs to follow and which resource pages to save. A wiring sketch follows this list.
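At the call site, customization just means choosing a Spider constructor. A minimal sketch of the wiring (MySaver and MyChooser are hypothetical stand-ins for your own SaveUtil and ResourseChooser implementations; the real interfaces and classes appear later in this post):

// Everything default: GET with a Chrome User-Agent, keep all URLs, save whole pages to f:/html
new Spider("http://www.bilibili.net").spiderstart();

// Custom storage and filtering
new Spider(new MySaver(), new MyChooser(), "http://www.bilibili.net").spiderstart();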

The parts already implemented are:

Downloading HTML pages, done with HttpClient.

Parsing HTML pages, done with jsoup.

The HtmlDownloader class downloads one HTML page from a given URL.

package DownloadPackage;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/*
 * Downloads one HTML page from a given URL.
 */
public class HtmlDownloader {

    RequestSet requestset = null;

    public HtmlDownloader(RequestSet requestset) {
        this.requestset = requestset;
    }

    public String downloadhtml(String url) {
        String html = null;
        // Create the client and a reader over the response entity
        BufferedReader reader = null;
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpResponse response = null;
        try {
            response = httpclient.execute(requestset.getMethod(url));
            HttpEntity entity = response.getEntity();
            reader = new BufferedReader(new InputStreamReader(entity.getContent()));
            StringBuilder sb = new StringBuilder();
            while ((html = reader.readLine()) != null) {
                sb.append(html);
            }
            html = sb.toString();
            System.out.println("Fetched one HTML page");
        }
        catch (IOException e) {
            System.out.println("Failed to connect to " + url);
        }
        finally {
            try {
                if (reader != null) {
                    reader.close();
                }
                // Close the client even if the reader was never opened
                httpclient.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }
        return html;
    }
}
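A quick usage sketch for HtmlDownloader on its own (assuming Java 8+, since RequestSet has a single abstract method and can therefore be written as a lambda):

package DownloadPackage;

import org.apache.http.client.methods.HttpGet;

public class DownloadExample {
    public static void main(String[] args) {
        // The simplest possible RequestSet: a bare GET with no extra headers
        HtmlDownloader downloader = new HtmlDownloader(url -> new HttpGet(url));
        String html = downloader.downloadhtml("http://www.bilibili.net");
        System.out.println(html == null ? "download failed" : "fetched " + html.length() + " characters");
    }
}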

The UrlGet class collects every URL link found in an HTML page.

package DownloadPackage;

import java.util.LinkedList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class UrlGet {

    public LinkedList<String> geturls(String html) {
        LinkedList<String> urls = new LinkedList<String>();
        Document doc = Jsoup.parse(html);
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            String url = link.attr("href");
            urls.add(url);
        }
        return urls;
    }
}
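A small check of what geturls returns (a hypothetical snippet; jsoup happily parses bare fragments):

package DownloadPackage;

import java.util.LinkedList;

public class UrlGetExample {
    public static void main(String[] args) {
        UrlGet getter = new UrlGet();
        LinkedList<String> urls = getter.geturls(
                "<a href=\"/video/av1\">relative</a> <a href=\"http://example.com\">absolute</a>");
        System.out.println(urls); // prints [/video/av1, http://example.com]
    }
}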

The resource-chooser interface requires three methods: isNeed decides whether a URL is wanted at all; isResourse decides whether a URL's page is a resource page worth saving; and process rewrites URLs that we want but whose format is wrong (for example, a relative link that needs the site prefix).

package ChoosePackage;

public interface ResourseChooser {
    public Boolean isNeed(String url);
    public Boolean isResourse(String url);
    public String process(String url);
}
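For example, a chooser that stays on one site and only saves certain pages might look like this (a sketch of mine, not part of the framework; the host and the "/video" rule are made-up assumptions):

package ChoosePackage;

public class SameHostChooser implements ResourseChooser {

    private final String host;

    public SameHostChooser(String host) {
        this.host = host; // e.g. "http://www.bilibili.net"
    }

    @Override
    public Boolean isNeed(String url) {
        // Follow relative links and absolute links on the same host only
        return url.startsWith("/") || url.startsWith(host);
    }

    @Override
    public Boolean isResourse(String url) {
        // Save only pages whose URL contains "/video" (an arbitrary example rule)
        return url.contains("/video");
    }

    @Override
    public String process(String url) {
        // Resolve relative links against the host
        return url.startsWith("/") ? host + url : url;
    }
}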

RequestSet is the interface for customizing the request; implement getMethod to build and return the request object.

package DownloadPackage;

import org.apache.http.client.methods.HttpGet;

/*
 * An interface for supplying the request object.
 * Implement getMethod to return the request for a given URL.
 */
public interface RequestSet {
    public HttpGet getMethod(String url);
}

The SaveUtil interface customizes how pages are stored; implement the save method.

package SaveUtil;

/*
 * Storage utility interface; implementations must provide save.
 */
public interface SaveUtil {
    public void save(String url, String html);
}
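A custom SaveUtil can then route pages anywhere. This hypothetical implementation writes each page into a directory of your choosing (the class name and path handling are my own, not part of the framework):

package SaveUtil;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class DirectorySaveUtil implements SaveUtil {

    private final Path dir;

    public DirectorySaveUtil(String dir) throws IOException {
        // Create the target directory (and parents) if missing
        this.dir = Files.createDirectories(Paths.get(dir));
    }

    @Override
    public void save(String url, String html) {
        // Derive a flat, filesystem-safe name from the URL
        String name = url.replaceAll("[^A-Za-z0-9]", "_") + ".html";
        try {
            Files.write(dir.resolve(name), html.getBytes(StandardCharsets.UTF_8));
        }
        catch (IOException e) {
            System.out.println("Failed to save " + url);
        }
    }
}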

The Spider class has five constructors supporting different combinations of customization, and it also contains the default implementations of the interfaces above. A usage example follows the class.

package Spider;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;

import ChoosePackage.ResourseChooser;
import DownloadPackage.HtmlDownloader;
import DownloadPackage.RequestSet;
import DownloadPackage.UrlGet;
import SaveUtil.SaveUtil;

/*
 * The crawling class.
 */
public class Spider {

    public static void main(String[] args) {
        new Spider("http://www.bilibili.net").spiderstart();
    }

    // Seed URL
    String seed = null;
    // Storage utility; supply your own implementation to customize saving
    private SaveUtil saveutil = null;
    // Downloads HTML pages
    private HtmlDownloader downloader = null;
    // Extracts URLs from a page
    private UrlGet urldownloader = null;
    // Decides which URLs to follow and which pages to save
    private ResourseChooser resoursechooser = null;
    // Pages not yet downloaded
    LinkedList<String> unvisited = new LinkedList<String>();
    // Pages already downloaded
    HashSet<String> visited = new HashSet<String>();

    // Constructor with custom storage, request, and resource filtering
    public Spider(SaveUtil saveutil, RequestSet request, ResourseChooser resoursechooser, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(request);
        this.urldownloader = new UrlGet();
        this.resoursechooser = resoursechooser;
        this.seed = seed;
        unvisited.add(seed);
    }

    // Constructor with custom storage and resource filtering
    public Spider(SaveUtil saveutil, ResourseChooser resoursechooser, String seed) {
        this.resoursechooser = resoursechooser;
        this.downloader = new HtmlDownloader(new getRequest());
        this.saveutil = saveutil;
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Constructor with custom storage and request
    public Spider(SaveUtil saveutil, RequestSet requestset, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(requestset);
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Constructor with custom storage
    public Spider(SaveUtil saveutil, String seed) {
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Default constructor
    public Spider(String seed) {
        this.saveutil = new MySaveUtil();
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Start crawling (public so callers outside the class can start the crawl)
    public void spiderstart() {
        String html = null;
        while (!unvisited.isEmpty()) {
            String url = unvisited.poll();
            System.out.println("Fetching " + url);
            if (resoursechooser.isNeed(url)) {
                try {
                    html = downloader.downloadhtml(url);
                }
                catch (RuntimeException e) {
                    System.out.println("Failed to connect to " + url);
                    continue;
                }
                visited.add(url);
                LinkedList<String> urls = new LinkedList<String>();
                try {
                    urls = urldownloader.geturls(html);
                }
                catch (RuntimeException e) {
                    System.out.println("Empty HTML page at " + url);
                    continue;
                }
                Iterator<String> it = urls.iterator();
                while (it.hasNext()) {
                    String newurl = it.next();
                    if (resoursechooser.isNeed(newurl) && !visited.contains(newurl) && !unvisited.contains(newurl)) {
                        newurl = resoursechooser.process(newurl);
                        unvisited.add(newurl);
                        System.out.println("Queued " + newurl);
                    }
                }
                System.out.println("Collected all URLs on " + url);
                if (resoursechooser.isResourse(url)) {
                    saveutil.save(url, html);
                }
            }
        }
    }

    // Default resource chooser
    private class MyResourseChooser implements ResourseChooser {

        @Override
        public Boolean isNeed(String url) {
            // Keep relative links and absolute http(s) links only
            if (!url.startsWith("/") && !url.startsWith("http")) {
                return false;
            }
            return true;
        }

        @Override
        public Boolean isResourse(String url) {
            // By default every accepted page is saved
            return true;
        }

        @Override
        public String process(String url) {
            // Prefix relative links with the seed to make them absolute
            if (!url.startsWith("http")) {
                url = seed + url;
            }
            return url;
        }
    }

    // Default request builder
    public class getRequest implements RequestSet {

        public HttpGet getMethod(String url) {
            // Create a GET request
            HttpGet getmethod = new HttpGet(url);
            // HttpHost proxy = new HttpHost("124.88.67.81", 80); // no proxy IP here
            // Configure the request timeouts
            RequestConfig responseconfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(10000)
                    .setConnectTimeout(10000)
                    .setSocketTimeout(10000)
                    .build();
            // Set the request headers, mainly User-Agent
            getmethod.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            // Apply the request configuration
            getmethod.setConfig(responseconfig);
            return getmethod;
        }
    }

    // Default storage: write each page to a .txt file under f:/html
    public class MySaveUtil implements SaveUtil {

        @Override
        public void save(String url, String html) {
            String filename = getfilename(url);
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(filename));
                writer.write(html);
                writer.flush();
                System.out.println("File written successfully");
            }
            catch (IOException e) {
                System.out.println("File write failed");
            }
            finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                }
                catch (IOException e) {
                    System.out.println("Failed to close the stream");
                }
            }
        }

        private String getfilename(String url) {
            String fileparentpath = "f:/html";
            File file = new File(fileparentpath);
            if (!file.exists()) {
                file.mkdir();
            }
            // Build a flat file name from the middle of the URL,
            // e.g. "http://www.bilibili.net" -> "bilibili.txt"
            int last = url.lastIndexOf(".");
            int first = url.indexOf(".");
            url = url.substring(first, last);
            url = url.replaceAll("\\.", "");
            url = url.replaceAll("/", "");
            return fileparentpath + "/" + url + ".txt";
        }
    }
}
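Putting it together, the constructors let you mix the defaults with the custom pieces sketched earlier (SameHostChooser and DirectorySaveUtil are my hypothetical examples from above, not part of the framework):

import java.io.IOException;

import ChoosePackage.SameHostChooser;
import SaveUtil.DirectorySaveUtil;
import Spider.Spider;

public class CrawlDemo {
    public static void main(String[] args) throws IOException {
        // All defaults
        new Spider("http://www.bilibili.net").spiderstart();

        // Custom storage, default request and filtering
        new Spider(new DirectorySaveUtil("f:/html/run1"), "http://www.bilibili.net").spiderstart();

        // Custom storage and filtering
        new Spider(new DirectorySaveUtil("f:/html/run2"),
                new SameHostChooser("http://www.bilibili.net"),
                "http://www.bilibili.net").spiderstart();
    }
}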

Summary

That's everything in this post about this simple Java crawler framework. I hope it helps; if you have any questions, feel free to leave a comment and I'll reply as soon as I can. Thanks for your support!
