当前位置:编程学习 > VB >>

VB6.0编写网站非法信息查询程序

 文/图 红色银狐
       朋友使用我给他写的根据特征码抓取网站信息尝到了甜头,但是也带来了一些问题,就是网站的非法信息太多,诸如X药、摇X丸之类的词语全都出现了,方便之处带来了隐患。
       就上面所带来的问题,我又和朋友沟通一番,找到了如下的解决方法:大致是利用百度进行非法关键字搜索,找到所有的网址,然后进行百度翻页,有这样的关键字就翻页到底。把获取的网址放到某个控件里,然后再写入数据库,这样的话就无需手动到自己的网站进行查找了,这个东东就可以解决了,它可以告诉你,你的网站哪个超链接有你不想要的非法关键字。
       好,思路理清,现在开始分析整个流程。
第一:导入关键字,利用百度搜索。
第二:分析搜索的超链接,并分析有效链接和无效链接。
第三:利用Internet Transfer控件抓取网页链接(根据特征码)。
第四:把获取的链接放入ListView控件中。
第五:将获取的链接进行处理,并写入数据库。
大体的流程就是这样的,有了流程,界面也就设计出来了,如图1所示。


图1
下面进入编码阶段,我们利用记事本来存放自己网站的网址。利用CommonDialog控件来导入已经存放好的网址,实现代码如下。
 
单击添加域名事件
Private Sub cmdAddDomain_Click()
Call AddWebOrBadWs(CDialog1, ltWebSite)
End Sub

添加LISTBOX元素
Private Sub AddWebOrBadWs(ByVal objCDlog As CommonDialog, ByVal objLt As ListBox)
Dim strFileName As String, strArr() As String, strFilePath As String, AllStr As String, LineData As String
Dim i As Long
Dim Fso As FileSystemObject, Stm As TextStream
objCDlog.ShowOpen
strFileName = objCDlog.FileName
strFilePath = strFileName
strFileName = Dir(strFileName, vbNormal)
If objCDlog.FilterIndex = 0 Then
Exit Sub
End If
objLt.Clear
strArr = Split(strFileName, ".", -1, vbTextCompare)
If UBound(strArr) > 0 Then
If LCase((strArr(1))) <> "txt" Then
Call MsgBox("文件必须是TXT格式", vbExclamation, TitleInfo)
Exit Sub
End If
End If
Set Fso = CreateObject("Scripting.FileSystemObject")
If Fso.FileExists(strFilePath) = True Then
Set Stm = Fso.OpenTextFile(strFilePath, , ForReading, False)
If Stm.AtEndOfStream = True Then
Call MsgBox("文件没有内容", vbExclamation, TitleInfo)
Exit Sub
End If
AllStr = Stm.ReadAll
strArr = Split(AllStr, vbCrLf, -1, vbTextCompare)
For i = 0 To UBound(strArr)
LineData = strArr(i)
If Len(LineData) > 0 Then
objLt.AddItem LineData
End If
Next
Set Stm = Nothing
End If
Set Fso = Nothing
End Sub

利用这段代码,我们就可以导入想要的网址了。至于非法关键字,朋友已经放在了服务器里的网站上了,下面这段代码可以针对其进行解决。
 
表单初始化
Private Sub Form_Load()
Dim strBadWs As String, strArr() As String, strPath As String, LineData As String
Dim i As Long
Dim Fso As Scripting.FileSystemObject, Stm As TextStream
On Error GoTo Display_Error
picScroll.FontSize = 10
strPath = App.Path & "BadWd.txt"
ScrollText = Inet1.OpenURL("asp">http://110.jizhezhan.com/zjxt.asp", 0)
Inet1.Cancel
strBadWs = Inet1.OpenURL("http://110.jizhezhan.com/BadWs.asp", 0)
Inet1.Cancel
strArr = Split(strBadWs, vbCrLf, -1, vbTextCompare)
For i = 0 To UBound(strArr)
LineData = strArr(i)
If Len(LineData) > 0 Then
ltBadWords.AddItem LineData
End If
Next
Exit Sub
Display_Error:
Call MsgBox(Err.Description, vbCritical, TitleInfo)
Set Fso = CreateObject("Scripting.FileSystemObject")
If Fso.FileExists(strPath) = True Then
Set Stm = Fso.OpenTextFile(strPath, ForReading, False)
Do While Stm.AtEndOfStream = False
LineData = Stm.ReadLine
If Len(LineData) > 0 Then
ltBadWords.AddItem LineData
End If
Loop
Set Stm = Nothing
End If
Set Fso = Nothing
End Sub

打开我写的这个软件时,大家还会看到一段滚动公告。这段滚动公告代码不是我写的,是从网络中搜索的,它利用API函数实现,其实不用API函数也可以实现,通过计时器的Timer控件完全可以实现,因为这个是朋友后来要加的,我也无心再写,就从网络中找到了一个,算是偷懒了。
接着便是我们要谈的重点了,就是如何循环找到每个关键字的所有超链接,并且把百度的翻页进行到底。下面的代码即可解决这个问题。
 
单击搜索事件
Private Sub cmdSearch_Click()
Dim ltWebCount As Long, ltBadCount As Long
StatusBar1.Panels(3).Text = ""
ltWebCount = ltWebSite.ListCount
ltBadCount = ltBadWords.ListCount
If ltWebCount = 0 Or ltBadCount = 0 Then
Exit Sub
End If
Page = 1
Total = 0
LoopCount = 0
PbMax = ltWebCount * ltBadCount
ProgressBar1.Max = PbMax
ltvValid.ListItems.Clear
ltvNotValid.ListItems.Clear
FrmSet.Enabled = False
FrmResult.Enabled = False
Call LoopOpen(ltWebSite, ltBadWords)
End Sub

循环打开所有查询
Private Sub LoopOpen(ByVal objLtweb As ListBox, ByVal objLtbad As ListBox)
Dim i As Long, k As Long, webLength As Long, badLength As Long, NotValids As Long
webLength = objLtweb.ListCount
badLength = objLtbad.ListCount
Dim NextPage As String, TempTypeUrl As String
Dim Pos As Long
Const strNexPage As String = "<font size=3>下一页</font>"
Const strB As String = "this.innerHTML,this.href"
Const strE As String = "target=""_blank"""
NotValids = 1
For i = 0 To webLength - 1
DoEvents
For k = 0 To badLength - 1
DoEvents
TempTypeUrl = URL & "site:" & objLtweb.List(i) & " " & URLEncode(objLtbad.List(k))
NextPage = Inet1.OpenURL(TempTypeUrl, 0)
Inet1.Cancel
Call HtmlBodyA(NextPage, strB, strE, k)
Pos = InStr(1, NextPage, strNexPage, vbTextCompare)
Do While Pos > 0
调用睡眠函数
Call Sleep(2000)
Page = Page + 1
NextPage = Inet1.OpenURL(TempTypeUrl & "&pn=" & (Page - 1) * 10, 0)
Call HtmlBodyA(NextPage, strB, strE, k)
Pos = InStr(1, NextPage, strNexPage, vbTextCompare)
Inet1.Cancel
Loop
LoopCount = LoopCount + 1
ProgressBar1.Value = LoopCount
Next
Next
ProgressBar1.Value = 0
StatusBar1.Panels(3).Text = "已经成功获取完数据"
Call WriteData(ltvValid, ltvNotValid, NotValids)
FrmSet.Enabled = True
FrmResult.Enabled = True
End Sub

不使用正则表达式来获取BODY数据
Private Sub HtmlBodyA(ByVal strCode As String, ByVal startStr As String, ByVal overStr As String, ByVal BadCounts As Long)
Dim PosA As Long, PosB As Long, PosC As Long, PosD As Long, PosE As Long
Dim tempStr As String, strArr() As String, strHref As String
Dim k As Long, i As Long, tempPos As Long
tempStr = LCase(strCode)
strArr = Split(tempStr, startStr, -1, vbTextCompare)
tempStr = Replace(tempStr, vbCrLf, "", vbTextCompare)
tempStr = Replace(tempStr, vbNewLine, "", vbTextCompare)
tempStr = Replace(tempStr, Chr(9), "", vbTextCompare)
k = UBound(strArr)
tempPos = 1
If k > 0 Then
For i = 0 To k - 1
DoEvents
PosA = InStr(tempPos, tempStr, startStr, vbTextCompare)
PosB = InStr(PosA, tempStr, overStr, vbTextCompare)
PosC = Len(startStr)
strHref = Mid$(tempStr, PosA + PosC, PosB - PosA)
strHref = Replace(strHref, startStr, "", 1, -1)
strHref = Replace(strHref, Chr(32), "", 1, -1)
PosD = InStr(1, strHref, "href=", vbTextCompare)
PosE = InStr(1, strHref, overStr, vbTextCompare)
strHref = Mid$(strHref, PosD, PosE - PosD)
strHref = Replace(strHref, Chr(32), "", 1, -1, vbTextCompare)
strHref = Replace(strHref, "href=", "", 1, -1, vbTextCompare)
strHref = Replace(strHref, Chr(34), "", 1, -1, vbTextCompare)
ltvValid.ListItems.Add , , ltBadWords.List(BadCounts)
ltvValid.ListItems(Total + 1).ListSubItems.Add 1, , strHref
ltvValid.ListItems(Total + 1).ListSubItems.Add 2, , "双击打开网页"
tempPos = PosB
Total = Total + 1
Debug.Print strHref
Next
End If
End Sub

这里必须根据网页来分析特征码,因为都是利用百度搜索,无论哪个关键字特征码都是一样的,我自己找的开始特征码和结束特征码如下:
(1)

补充:软件开发 , Vb ,
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部份技术文章来自网络,