java 批量抓取网页

在做网页抓取时，我遇到了抓取超时的问题，想在这里请教各位该如何解决。

首先，获取网页的内容我主要是建了一个类GrabWebContent.java，具体代码如下：

/**
 * Downloads the full text of a web page and writes text to files.
 *
 * <p>Every network fetch uses explicit connect and read timeouts, and a fetch
 * that times out is retried a bounded number of times, so a single slow server
 * cannot stall a batch crawl indefinitely.
 */
public class GrabWebContent {

	/** Maximum time to wait for a connection to be established, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 10000;

	/** Maximum time to wait for data on an open connection, in milliseconds. */
	private static final int READ_TIMEOUT_MS = 30000;

	/** Number of attempts made for one page before a timeout is treated as final. */
	private static final int MAX_ATTEMPTS = 3;

	/**
	 * Reads the whole content of a web page, decoded as UTF-8.
	 *
	 * <p>Lines are concatenated without their line terminators, so the result
	 * is a single line of text.
	 *
	 * @param htmlurl address of the page to fetch
	 * @return the page text; an empty string when the URL is malformed, and
	 *         whatever was read before the failure when an I/O error or the
	 *         final timeout occurs (never {@code null})
	 */
	public String getHtmlContent(String htmlurl) {
		URL url;
		try {
			url = new URL(htmlurl);
		} catch (final MalformedURLException me) {
			System.out.println("你输入的URL格式有问题!");
			// Report which URL was rejected and why instead of discarding the message.
			System.out.println(htmlurl + ": " + me.getMessage());
			return "";
		}

		for (int attempt = 1;; attempt++) {
			// A fresh buffer per attempt so a retry never duplicates partial content.
			StringBuilder sb = new StringBuilder();
			try {
				readAll(url, sb);
				return sb.toString();
			} catch (final java.net.SocketTimeoutException te) {
				if (attempt >= MAX_ATTEMPTS) {
					te.printStackTrace();
					return sb.toString();
				}
				System.err.println("Timeout fetching " + htmlurl + " (attempt "
						+ attempt + " of " + MAX_ATTEMPTS + "), retrying");
			} catch (final IOException e) {
				// Non-timeout failures (404, refused connection, ...) are not retried.
				e.printStackTrace();
				return sb.toString();
			}
		}
	}

	/**
	 * Opens {@code url} with the configured timeouts and appends every line of
	 * the response to {@code sb}; the reader is closed even when reading fails.
	 */
	private static void readAll(URL url, StringBuilder sb) throws IOException {
		java.net.URLConnection conn = url.openConnection();
		conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
		conn.setReadTimeout(READ_TIMEOUT_MS);

		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf8"));
			String line;
			while ((line = in.readLine()) != null) {
				sb.append(line);
			}
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (final IOException ignored) {
					// Nothing useful can be done if closing the reader fails.
				}
			}
		}
	}

	/**
	 * Writes a string to a text file, replacing any existing content.
	 *
	 * <p>The file is written as UTF-8, the same encoding
	 * {@link #getHtmlContent(String)} uses for reading.
	 *
	 * @param context  text to write
	 * @param filePath path of the file to create or overwrite
	 * @return always {@code true}; failures are reported by exception
	 * @throws IOException if the file cannot be opened or written
	 */
	public boolean writeTxt(String context, String filePath) throws IOException {
		System.out.println("开始写文件。。");
		OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(filePath), "UTF-8");
		try {
			osw.write(context);
			osw.flush();
		} finally {
			// Release the file handle even when the write fails.
			osw.close();
		}
		return true;
	}
}

URL 来自数据库：先从数据库里读取 url 信息，然后用 for 循环逐条抓取每个 url 对应的网页内容，具体代码如下：

package dataGrab;



import java.sql.*;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



import chinapsp.db.ConnectionDB;

import dataToDB.ListAgency;



public class Result_ZBJG_Details {



	/**

	 * @use for 获取指定日期的招标结果的内容 usage: 1.设置date 2.设置url 3.设置content

	 */

	String date = "";// 日期

	List<String> url = new ArrayList<String>();// 日期对应的url

	List<String> content = new ArrayList<String>();// url对应的内容

	String insertIntoZBJG_Content = "";

	String tableName = "ZBJG_Content";

	java.sql.Connection con;



	public Result_ZBJG_Details() {

		this.con = ConnectionDB.getConnection();

	}



	public void setDate(String date) {

		this.date = date;

	}



	public String getDate() {

		return date;

	}



	// 从数据库DataGrab的表ZhaoBiaoJieGuo_All中获取对应日期的网址

	public void setUrl() {

		List<String> list = new ArrayList<String>();

		this.con = ConnectionDB.getConnection();

		if(date!="" && date!=null)

		{

			try {

				Statement sql = con.createStatement();

				ResultSet rs = sql

						.executeQuery("select * from ZhaoBiaoJieGuo_All where "

								+ " date like '%" + date

								+ "%' and title not like '%失败%'");

				while (rs.next()) {

					list.add(rs.getString("link").trim());

				}

				con.close();

			} catch (SQLException e) {

				// TODO Auto-generated catch block

				e.printStackTrace();

			}

		}

		url = list;

	}



	public List<String> getUrl() {

		return url;

	}



	// 获取该日期网址的网页内容

	public void setContent() {

		List<String> temp = new ArrayList<String>();

		GrabWebContent gwv = new GrabWebContent();

		if(url.size()>0){

			for (int i = 0; i < url.size(); i++) {

				String regex;

				List<String> body = new ArrayList<String>();

				regex = "新闻列表.*?数据列表";// 先获取主体内容

				Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

				Matcher ma = pa.matcher(gwv.getHtmlContent(url.get(i)));

				while (ma.find()) {

					body.add(ma.group());

				}

				temp.add(body.toString());

			}

			// 内容格式化

			List<String> content1 = new ArrayList<String>();

			for (int i = 0; i < temp.size(); i++) {

				String regex;

				List<String> body = new ArrayList<String>();

				regex = "<div class=\"title\">.*</div>";// 先获取主体内容

				Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

				Matcher ma = pa.matcher(temp.get(i).replaceAll("style.*?>", ">"));

				while (ma.find()) {

					body.add(ma.group());

				}

				content1.add(body.toString());

			}

			content = content1;

		}

		

	}



	public List<String> getContent() {

		return content;

	}



	// 将招标结果content导入数据库DataGrab的表ZBJG_Content中

	/**

	 * 思路:1.先调用setDate();设置日期

	 * 		2.然后调用setUrl();从数据库中获取该日期下的url链接

	 * 		3.调用setContent;抓取url数组下的网页内容,使用的是for循环,可能出现连接超时,效率不高,待优化

	 * 		4.getInsertIntoZBJG_Content()函数先获取网页内容里面的标题,存到title里

	 * 		5.调用类ListAgency获取代理机构列表agLisy

	 * 		6.将数组content每一条记录与agList的代理机构匹配,成功则agency数组增加该机构,否则增加为"无该代理机构,请人工输入"

	 * 		7.数据库操作,把抓取的数据插入数据库,包括网页标题/内容/代理机构

	 */

	public String getInsertIntoZBJG_Content() {

		String str = "";

		// 获取标题

		String regex;

		List<String> title = new ArrayList<String>();

		regex = "(?<=title\">).*?(?=</)";

		Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

		Matcher ma = pa.matcher(content.toString());

		while (ma.find()) {

			title.add(ma.group());

		}

		// 获取代理机构

		List<String> agency = new ArrayList<String>();

		ListAgency ag = new ListAgency();

		List<String> agList = new ArrayList();

		agList = ag.getAgency();

		for (int i = 0; i < content.size(); i++) {

			for (int k = 0; k < agList.size(); k++) {

				pa = Pattern.compile(agList.get(k), Pattern.DOTALL);

				ma = pa.matcher(content.get(i));

				if (ma.find()) {

					agency.add(agList.get(k));

					break;

				} else if (k == agList.size()) {

					agency.add("无该代理机构,请人工输入");

					break;

				}

			}

		}

		// 数据库操作,把抓取的数据插入数据库,包括网页标题/内容/代理机构

		this.con = ConnectionDB.getConnection();

		if (title.size() == content.size() && title.size() == agency.size()) {

			PreparedStatement sql;

			String s = " INSERT INTO " + tableName + " (title,content,agency) "

					+ " VALUES (?,?,?)";

			try {

				sql = con.prepareStatement(s);

				for (int i = 0; i < title.size(); i++) {

					sql.setString(1, title.get(i).trim());

					sql.setString(2, content.get(i).trim());

					sql.setString(3, agency.get(i).trim());

					sql.addBatch();

				}

				int count = 0;// 计数器,计算插入数据的条数

				int inserts[];

				// 执行插入操作

				inserts = sql.executeBatch();

				con.close();

				// 判断是否全部插入

				for (int k = 0; k < inserts.length; k++) {

					if (inserts[k] != 0) {

						count++;

					}

				}

				if (count == title.size()) {

					str = "成功插入" + count + "条记录成功!";

				} else {

					str = "插入了" + count + "条数据, 但有" + (content.size() - count)

							+ "条记录没有插入, 请返回检查!!!";

				}



			} catch (SQLException e) {

				// TODO Auto-generated catch block

				e.printStackTrace();

			}

		}

		else{

			str = "title、content、agency数量不等，请检查";

		}

		return str;

	}



	// 获取代理机构名称			该方法不完善,网页上如果没有采购代理机构等标注的话捕获不到

	public List<String> getAgency(List<String> content) {

		List<String> agency = new ArrayList<String>();

		String regex;

		// List<String> list = new ArrayList<String>();

		regex = "(采购代理机构：.*?<)|(采购代理机构名称：.*?<)";

		Pattern pa = Pattern.compile(regex, Pattern.DOTALL);

		Matcher ma = pa.matcher(content.toString());

		while (ma.find()) {

			agency.add(ma.group());

		}

		return agency;

	}



	public static void main(String[] args) {

		// TODO Auto-generated method stub

		Result_ZBJG_Details r = new Result_ZBJG_Details();

		r.setDate("2013-09-24");

		// String date = r.getDate();

		r.setUrl();

		// List<String> url = new ArrayList<String>();

		// url = r.getUrl();

		r.setContent();

		List<String> content = new ArrayList<String>();

		content = r.getContent();

		System.out.println(r.getInsertIntoZBJG_Content());

		// System.out.println(r.getUrl());

	}



}

现在的问题是：获取 url 内容时有时会提示超时，有时又能正常提取，请问该怎么优化呢？

标签：java、网页抓取、网页抓取超时、网页爬虫

--------------------编程问答--------------------

抓取网页我用 httpclient

--------------------编程问答--------------------

引用 1 楼 songbgi 的回复:

抓取网页我用httpclient

两者有什么区别呢？如果需要修改，要怎么改呢？

补充：Java ,  Java相关