首页 > 学院 > 开发设计 > 正文

URLConnection实现爬虫(解决重定向、设置cookie才能抓取页面等问题)

2019-11-08 01:57:26
字体:
来源:转载
供稿:网友

1.关键方法

/** * 向指定 URL 发送POST方法的请求 * * @param url * 发送请求的 URL * @param param * 请求参数,请求参数应该是 name1=value1&name2=value2 的形式。 * @param encode * 请求页面的字符编码 * @param cookie * cookie * @return 所代表远程资源的响应结果 */ public static String sendPost1(String url, String param, String encode,String cookie) { PRintWriter out = null; BufferedReader in = null; String result = ""; try { URL realUrl = new URL(url); // 打开和URL之间的连接 URLConnection conn = realUrl.openConnection(); // 设置通用的请求属性 conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("Cookie",cookie); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); // 发送POST请求必须设置如下两行 conn.setDoOutput(true); conn.setDoInput(true); // 获取URLConnection对象对应的输出流 out = new PrintWriter(conn.getOutputStream()); // 发送请求参数 out.print(param); // flush输出流的缓冲 out.flush(); // 定义BufferedReader输入流来读取URL的响应 in = new BufferedReader( new InputStreamReader(conn.getInputStream(),encode)); String line; while ((line = in.readLine()) != null) { result += line; } } catch (Exception e) { System.out.println("发送 POST 请求出现异常!"+e); e.printStackTrace(); } //使用finally块来关闭输出流、输入流 finally{ try{ if(out!=null){ out.close(); } if(in!=null){ in.close(); } } catch(IOException ex){ ex.printStackTrace(); } } return result; } /** * 获取cookie * * @param url * 发送请求的URL * @return key=value;key=value;... 
*/ public static String getCookie2(String url) { HttpURLConnection conn = null; try { URL realUrl = new URL(url); conn = (HttpURLConnection) realUrl.openConnection(); conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); //是否自动执行 http 重定向,默认为true //如果实际操作中,不存在重定向问题,不需要设置此行。 conn.setInstanceFollowRedirects(false); conn.setDoInput(true); conn.setDoOutput(true); conn.setRequestMethod("POST"); } catch (Exception e) { e.printStackTrace(); } String sessionId = ""; String cookieVal = ""; String key = null; // Map<String,List<String>> map = conn.getHeaderFields();// for (String key1 : map.keySet()) {// System.out.println(key1 + "--->" + map.get(key1));// } //取cookie for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){ if(key.equalsIgnoreCase("set-cookie")){ cookieVal = conn.getHeaderField(i); cookieVal = cookieVal.substring(0, cookieVal.indexOf(";")); sessionId = sessionId + cookieVal + ";"; } } //如果实际操作中,不存在重定向问题,不需要以下四行 String location= conn.getHeaderField("Location");//获取 重定向地址 List<String> list = getCookie3(location,sessionId); List<String> list2 = getCookie3(list.get(1),sessionId+list.get(0)); sessionId = sessionId + list2.get(0); return sessionId; } /** * 获取 cookie * @param url * 发送请求的URL * @param cookie * cookie */ public static List<String> getCookie3(String url,String cookie) { HttpURLConnection conn = null; try { URL realUrl = new URL(url); conn = (HttpURLConnection) realUrl.openConnection(); conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); 
conn.setRequestProperty("Accept-Encoding","gzip, deflate, sdch"); conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setRequestProperty("Cache-Control","max-age=0"); conn.setRequestProperty("connection", "Keep-Alive"); //conn.setRequestProperty("Host","www.zjtax.gov.cn"); conn.setRequestProperty("user-agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)"); conn.setRequestProperty("Cookie",cookie); conn.setInstanceFollowRedirects(false); conn.setDoInput(true); conn.setDoOutput(true); conn.setRequestMethod("POST"); } catch (Exception e) { e.printStackTrace(); } String sessionId = ""; String cookieVal = ""; String key = null; String location= conn.getHeaderField("Location"); for(int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++){ if(key.equalsIgnoreCase("set-cookie")){ cookieVal = conn.getHeaderField(i); cookieVal = cookieVal.substring(0, cookieVal.indexOf(";")); sessionId = sessionId + cookieVal + ";"; } } List<String> list = new ArrayList<String>(); list.add(sessionId);//存放cookie list.add(location);//存放重定向地址 return list; }

另附,最基本的get抓取、post抓取、获取cookie方法

/**
 * Minimal HTTP helpers built on {@link URLConnection}: a plain GET, a plain POST,
 * and a request that only harvests the cookies a server sets.
 */
public class HttpURLContent {

    /**
     * Sends a GET request and returns the response body.
     *
     * @param url   URL to send the request to
     * @param param query string in {@code name1=value1&name2=value2} form; when
     *              {@code null} or empty, {@code url} is requested unchanged
     * @return the response body with line terminators removed; if the request fails,
     *         whatever had been read before the failure (possibly an empty string)
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            // Concatenating a null param would request "url?null".
            String urlNameString = (param == null || param.isEmpty()) ? url : url + "?" + param;
            URL realUrl = new URL(urlNameString);
            URLConnection connection = realUrl.openConnection();
            // Common request headers.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            connection.connect();
            in = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Sends a POST request and returns the response body.
     *
     * @param url   URL to send the request to
     * @param param request body in {@code name1=value1&name2=value2} form;
     *              {@code null} sends an empty body
     * @return the response body with line terminators removed; if the request fails,
     *         whatever had been read before the failure (possibly an empty string)
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        BufferedReader in = null;
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection conn = realUrl.openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags are required before a body can be written to a URLConnection.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            out = new PrintWriter(conn.getOutputStream());
            // print(null) would send the four characters "null" as the body.
            if (param != null) {
                out.print(param);
            }
            out.flush();
            in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
        }
        return result.toString();
    }

    /**
     * Requests {@code url} once and returns the cookies the server sets in its response.
     *
     * @param url URL to send the request to
     * @return cookies as {@code key=value;key=value;...}; empty string when the
     *         response sets no cookies or the connection could not be opened
     */
    public static String getCookie(String url) {
        HttpURLConnection conn = null;
        try {
            URL realUrl = new URL(url);
            conn = (HttpURLConnection) realUrl.openConnection();
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setRequestMethod("POST");
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (conn == null) {
            // Malformed URL or failed open: there is no response to read cookies from.
            return "";
        }
        StringBuilder sessionId = new StringBuilder();
        try {
            String key;
            // Index 0 is the status line, so header fields are scanned from index 1.
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                if (!key.equalsIgnoreCase("set-cookie")) {
                    continue;
                }
                String cookieVal = conn.getHeaderField(i);
                if (cookieVal == null) {
                    continue;
                }
                // Attributes (Path, Expires, ...) follow the first ';'; a cookie may have none.
                int end = cookieVal.indexOf(';');
                sessionId.append(end >= 0 ? cookieVal.substring(0, end) : cookieVal).append(';');
            }
        } finally {
            conn.disconnect();
        }
        return sessionId.toString();
    }
}

2.问题总结

第一步:使用最基本的方法直接抓取;如果能抓取到内容,恭喜你。

第二步:直接抓取页面无果时,通过设置cookie抓取,即conn.setRequestProperty("Cookie",cookie);

第三步:新的问题是,如何获取cookie,当第一次访问页面时会产生cookie。所以要先访问一次页面,拿到cookie。即getCookie(String url)方法

第四步:这里就比较复杂了,我接触的大部分页面抓取,目标页面不存在重定向。如果遇到,就需要使用getCookie2()和getCookie3()方法 获取cookie。

这也是我目前遇到的最麻烦的抓取,用了两天才解决。加油加油加油!!!

3.测试代码

/** * 出口退税率查询 * 测试url: * http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp?sotype=FULLNAME&sovalue=钢铁&PageIndex=1 */ public HashMap<String,Object> getCktsls(String url){ //先获取cookie String cookie= HttpURLContent.getCookie2("http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp"); HashMap<String,Object> re = new HashMap<String,Object>(); //抓取结果 String result = HttpURLContent.sendPost1(url,null,"utf-8",cookie); //System.out.println(result); //以下代码是对结果的处理了。。。根据实际情况。。。 if(result.contains("<font color='#104194'>共")){//查询到结果 //总页数 String[] result_arr = result.split("<font color='#104194'>共"); String totalPage_str = result_arr[1].substring(0, result_arr[1].indexOf("页")).trim(); List<Map<String,String>> mapList = new ArrayList<Map<String,String>>(); String[] result_arr1 = result.split("class=/"gs_cx4_sp7/">"); for(int i=1;i<result_arr1.length;i++){ Map<String,String> map = new HashMap<String,String>(); map.put("number", result_arr1[i].substring(0, result_arr1[i].indexOf("</span>"))); String[] result_arr2 = result_arr1[i].split("/">"); for(int j=1;j<result_arr2.length;j++){ String value = ""; if(j<=5) value = result_arr2[j].substring(0, result_arr2[j].indexOf("</span>")); switch (j) { case 1: map.put("nsrmc",value ); break; case 2: map.put("type", value); break; case 3: map.put("sdate", value); break; case 4: map.put("edate", value); break; case 5: map.put("sign", value); break; default: break; } } mapList.add(map); } re.put("totalPage_str", totalPage_str); re.put("result", mapList); }else{//未查询到结果 } return re; }
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表