当前位置:编程学习 > C#/ASP.NET >>

网站数据采集器怎么做

我想做个数据采集器,把中国体育彩票开奖的信息取出来。
不懂怎么动手,请详细指点一下。 网站数据采集 彩票 --------------------编程问答-------------------- 可以用http协议获取中国体育彩票网站的信息,然后根据获取到的html数据进行正则匹配出开奖的信息…… --------------------编程问答--------------------
    protected void Button1_Click(object sender, EventArgs e)
    {
        WebRequest wc = HttpWebRequest.Create("http://www.cznd.gov.cn/node/jrgxq_qnyw/2013-7-5/137512575342148320.html");
        wc.ContentType = "application/x-www-form-urlencoded;charset=gb2312";
        using (WebResponse wq = wc.GetResponse())
        {
            using (Stream s = wq.GetResponseStream())
            {
                using (StreamReader sr = new StreamReader(s, Encoding.GetEncoding("gb2312")))
                {
                    string html = sr.ReadToEnd();
                    Match m = Regex.Match(html, @"(?i)<td[^>]*?class=(['""]?)NewsContent\1[^>]*?>\s*?<p[^>]*?>\s*?([\s\S]*?)</p>");
                    string result = m.Groups[2].Value;
                    Console.Write(result);
                    Console.ReadLine();
                }
            }
        }
    }



前几天看到的一个案例. --------------------编程问答-------------------- </div><TABLE width="366" align="center" cellpadding="0" cellspacing="0" style="color: #4a4a48;"><TR bgcolor="#ececec" align="center"><TD width="54" height="24">玩法</TD><TD width="50">期号</TD><TD width="166">开奖号</TD><TD width="32"><FONT style="font-size: 13px;">详情</FONT></TD><TD width="32"><FONT style="font-size: 13px;">历史</FONT></TD><TD width="32"><FONT style="font-size: 13px;">图表</FONT></TD></TR><TR align="center"><TD height="40"><FONT>大乐透</FONT></TD><TD>13082 </TD><TD align="left"><TABLE width='159' height='21' align='left' cellpadding='0' cellspacing='0' style='color: #ffffff;font-weight:bold;font-family: 宋体;'><TR align='center'><TD width='21' background='/images/20055.gif' style='color: #ffffff'>03</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>09</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>25</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>26</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>33</TD><TD width='2'></TD><TD width='21' background='/images/20056.gif' style='color: #ffffff'>03</TD><TD width='2'></TD><TD width='21' background='/images/20056.gif' style='color: #ffffff'>12</TD></TR></TABLE></TD><TD><A href='/news/11010219.shtml' target='_blank'><IMG src='/images/20014.gif' border='0' /></A></TD><TD><A href='/lottery/dlt/History.aspx' target="_blank"><IMG src="/images/20016.gif" border="0" /></A></TD><TD><A href='http://data.lottery.gov.cn/chart_tc2/chart.shtml?LotID=23529&ChartID=20001&StatType=0&MinIssue=2012026&MaxIssue=2012125&IssueTop=100&tab=0' target="_blank"><IMG src="/images/20017.gif" border="0" /></A></TD></TR><TR><TD colspan='7' height='1' background='/images/20022.gif'></TD></TR><TR align="center"><TD height="40"><FONT>排列3</FONT></TD><TD>13191 </TD><TD align="left"><TABLE width='67' height='21' align='left' 
cellpadding='0' cellspacing='0' style='color: #000000;font-weight:bold;font-family: 宋体;'><TR align='center'><TD width='21' background='/images/20057.gif' style='color: #ffffff'>4</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>7</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>3</TD></TR></TABLE></TD><TD><A href='/news/11010220.shtml' target='_blank'><IMG src='/images/20014.gif' border='0' /></A></TD><TD><A href='/lottery/pls/History.aspx' target="_blank"><IMG src="/images/20016.gif" border="0" /></A></TD><TD><A href='http://data.lottery.gov.cn/chart_tc2/chart.shtml?LotID=33&ChartID=20001&StatType=0&MinIssue=2012263&MaxIssue=2012292&IssueTop=30&tab=0' target="_blank"><IMG src="/images/20017.gif" border="0" /></A></TD></TR><TR><TD colspan='7' height='1' background='/images/20022.gif'></TD></TR><TR align="center"><TD height="40"><FONT>排列5</FONT></TD><TD>13191 </TD><TD align="left"><TABLE width='113' height='21' align='left' cellpadding='0' cellspacing='0' style='color: #000000;font-weight:bold;font-family: 宋体;'><TR align='center'><TD width='21' background='/images/20057.gif' style='color: #ffffff'>4</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>7</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>3</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>4</TD><TD width='2'></TD><TD width='21' background='/images/20057.gif' style='color: #ffffff'>4</TD></TR></TABLE></TD><TD><A href='/news/11010221.shtml' target='_blank'><IMG src='/images/20014.gif' border='0' /></A></TD><TD><A href='/lottery/plw/History.aspx' target="_blank"><IMG src="/images/20016.gif" border="0" /></A></TD><TD><A href='http://data.lottery.gov.cn/chart_tc2/chart.shtml?LotID=35&ChartID=20001&StatType=0&MinIssue=&MaxIssue=&IssueTop=30' target="_blank"><IMG src="/images/20017.gif" border="0" 
/></A></TD></TR><TR><TD colspan='7' height='1' background='/images/20022.gif'></TD></TR><TR align="center"><TD height="40"><FONT>22选5</FONT></TD><TD>13172 </TD><TD align="left"><TABLE width='113' height='21' align='left' cellpadding='0' cellspacing='0' style='color: #ffffff;font-weight:bold;font-family: 宋体;'><TR align='center'><TD width='21' background='/images/20055.gif' style='color: #ffffff'>08</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>09</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>14</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>15</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>19</TD></TR></TABLE></TD><TD><A href='/news/11009537.shtml' target='_blank'><IMG src='/images/20014.gif' border='0' /></A></TD><TD><A href='/lottery/eexw/History.aspx' target="_blank"><IMG src="/images/20016.gif" border="0" /></A></TD><TD><A href='http://data.lottery.gov.cn/chart_tc2/chart.shtml?LotID=23525&ChartID=20001&StatType=0&MinIssue=&MaxIssue=&IssueTop=30' target="_blank"><IMG src="/images/20017.gif" border="0" /></A></TD></TR><TR><TD colspan='7' height='1' background='/images/20022.gif'></TD></TR><TR align="center"><TD height="40"><FONT>7星彩</FONT></TD><TD>13082 </TD><TD align="left"><TABLE width='159' height='21' align='left' cellpadding='0' cellspacing='0' style='color: #ffffff;font-weight:bold;font-family: 宋体;'><TR align='center'><TD width='21' background='/images/20055.gif' style='color: #ffffff'>6</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>0</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>8</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>0</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>1</TD><TD 
width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>4</TD><TD width='2'></TD><TD width='21' background='/images/20055.gif' style='color: #ffffff'>8</TD></TR></TABLE></TD><TD><A href='/news/11010180.shtml' target='_blank'><IMG src='/images/20014.gif' border='0' /></A></TD><TD><A href='/lottery/qxc/History.aspx' target="_blank"><IMG src="/images/20016.gif" border="0" /></A></TD><TD><A href='http://data.lottery.gov.cn/chart_tc2/chart.shtml?LotID=10022&ChartID=20001&StatType=0&MinIssue=&MaxIssue=&IssueTop=30' target="_blank"><IMG src="/images/20017.gif" border="0" /></A></TD></TR></tr><tr><TR><TD colspan="6" height="31" background="/images/20078.gif"><TABLE width="360" align="center" cellpadding="0" cellspacing="0" border="0"><TR><TD colspan="2" height="2"></TD></TR><TR><TD width="65"></TD><TD width="295">超级大乐透 <span id="LabelDLT" class="FontPool">1.71 亿元</span>   派奖 <span id="LabelQXC" class="FontPool">500 万元</span></TD></TR></TABLE></TD></TR></TABLE><SCRIPT type="text/javascript">var _bdhmProtocol = (("https:" == document.location.protocol) ? " https://" : " http://"); document.write(unescape("%3Cscript src='" + _bdhmProtocol + "hm.baidu.com/h.js%3F8929ffae85e1c07a7ded061329fbf441' type='text/javascript'%3E%3C/script%3E")); </SCRIPT></form></BODY></HTML>

怎么写正则,从上面的 HTML 中取出各玩法的期号和开奖号?

--------------------编程问答-------------------- 其实主页面采用了框架iframe,指向地址为http://www.lottery.gov.cn/lottery/draws/Global.aspx

因此你直接获取该地址的内容就可以了

示例代码如下
 WebRequest wc = HttpWebRequest.Create("http://www.lottery.gov.cn/lottery/draws/Global.aspx");
            wc.ContentType = "application/x-www-form-urlencoded;charset=gb2312";
            using (WebResponse wq = wc.GetResponse())
            {
                using (Stream s = wq.GetResponseStream())
                {
                    using (StreamReader sr = new StreamReader(s, Encoding.GetEncoding("utf-8")))
                    {
                        string html = sr.ReadToEnd();
                        
                        string pattern=@"(?i)<tr((?!.*?bgcolor)[^>]*?)>\s*?<td[^>]*?>\s*?<font>([^>]*?)</font>\s*?</td>\s*?<td[^>]*?>([^<>]*?)</td>\s*?<td[^>]*?>\s*?<table[^>]*?>[\s\S]*?(<td[^>]*?>((?<Num>\d+)|\s*?)</td>)*?\s*?</tr>\s*?[\s\S]*?</table>";
                        var result = Regex.Matches(html, pattern).OfType<Match>().Select(a => new { 
                            玩法=a.Groups[2].Value,
                            期号=a.Groups[3].Value,
                            开奖号=string.Join(" ",a.Groups["Num"].Captures.OfType<Capture>().Select(b=>b.Value))
                        });
                        /*
                         + [0] { 玩法 = "大乐透", 期号 = "13082 ", 开奖号 = "03 09 25 26 33 03 12" } <Anonymous Type>
                        + [1] { 玩法 = "排列3", 期号 = "13191 ", 开奖号 = "4 7 3" } <Anonymous Type>
                        + [2] { 玩法 = "排列5", 期号 = "13191 ", 开奖号 = "4 7 3 4 4" } <Anonymous Type>
                        + [3] { 玩法 = "22选5", 期号 = "13172 ", 开奖号 = "08 09 14 15 19" } <Anonymous Type>
                        + [4] { 玩法 = "7星彩", 期号 = "13082 ", 开奖号 = "6 0 8 0 1 4 8" } <Anonymous Type>

                         */

                    }
                }
--------------------编程问答-------------------- 爬取数据啊
主要还是html分析 

可以使用 htmlagilitypack

参考 http://www.cnblogs.com/wangchuang/archive/2013/03/11/2953638.html --------------------编程问答-------------------- 我已经得到该地址的内容,但是正则啊,我还是搞不定啊
补充:.NET技术 ,  C#
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部份技术文章来自网络,