Sub GetCatalogPages()
For n = 1 To 20
CatalogURL = "http://blog.sina.com.cn/s/_" & n & ".html"
Call GetCatalogByUrl(CatalogURL)
Next n
End Sub
Sub GetCatalogByUrl(ByVal CatalogURL As String)
‘Dim CatalogURL As String
Dim WebText As String
Dim OneSpan As Object
Dim OneA As Object
Dim Wb As Workbook
Dim Sht As Worksheet
Dim Rng As Range
Dim i As Long, j As Long
Dim StartTime As Variant ‘开始时间
Dim UsedTime As Variant ‘使用时间
StartTime = VBA.Timer ‘记录开始时间
AppSettings
Set Wb = Application.ThisWorkbook
Set Sht = Wb.Worksheets("Catalog")
With Sht
‘.UsedRange.Offset(1).ClearContents
‘i = 1
endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
i = endrow
‘发送请求
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", CatalogURL, False
.Send
WebText = .responsetext
End With
‘创建网页文件 创建 Html Dom
‘Microsoft HTML Object Library
With CreateObject("htmlfile")
.write WebText
For Each OneA In .getElementsByTagName("a")
href = OneA.href
If href Like "*http://blog.sina.com.cn/s/blog_*" Then
i = i + 1
Sht.Cells(i, 2).Value = href
‘ Sht.Hyperlinks.Add Sht.Cells(i, 2), href ‘, href
End If
Next OneA
i = endrow
For Each OneMeta In .getElementsByTagName("meta")
If OneMeta.Name = "description" Then
cnt = OneMeta.Content
‘Debug.Print cnt
titles = Split(Split(cnt, "xxxx,")(1), ",")
For n = LBound(titles) To UBound(titles) Step 1
i = i + 1
Sht.Cells(i, 1).Value = titles(n)
Next n
End If
Next OneMeta
End With
End With
AppSettings False
UsedTime = VBA.Timer - StartTime
Debug.Print "采集 " & CatalogURL; " : " & Format(UsedTime, "#0.0000秒")
‘MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
End Sub
Sub GetQuestionsByExamUrl()
Dim Wb As Workbook
Dim Sht As Worksheet
Dim oSht As Worksheet
Set Wb = Application.ThisWorkbook
Set Sht = Wb.Worksheets("Catalog")
Set oSht = Wb.Worksheets("Question")
With Sht
endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
Set Rng = .Range("A2:B" & endrow)
Arr = Rng.Value
End With
With oSht
r = 1
For i = LBound(Arr) To UBound(Arr)
ExamTitle = Arr(i, 1)
ExamUrl = Arr(i, 2)
ExamText = GetExamTextByUrl(ExamUrl)
Ques = RegGetArray(ExamText, "([\((]\d[\))][^\r\n]*)[\r\n]")
For n = LBound(Ques) To UBound(Ques) Step 1
r = r + 1
.Cells(r, 1).Value = ExamTitle
.Cells(r, 2).Value = ExamUrl
.Cells(r, 3).Value = Ques(n)
Next n
Next i
End With
Set Wb = Nothing
Set Sht = Nothing
Set oSht = Nothing
End Sub
Function GetExamTextByUrl(ByVal ExamUrl As String) As String
‘发送请求
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", ExamUrl, False
.Send
WebText = .responsetext
‘Debug.Print WebText
End With
With CreateObject("htmlfile")
.write WebText
Set examdiv = .getElementById("sina_keyword_ad_area2")
‘ Debug.Print examdiv.innerText
GetExamTextByUrl = examdiv.innerText
End With
End Function
Private Sub AppSettings(Optional IsStart As Boolean = True)
If IsStart Then
Application.ScreenUpdating = False
Application.DisplayAlerts = False
Application.Calculation = xlCalculationManual
Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
Else
Application.ScreenUpdating = True
Application.DisplayAlerts = True
Application.Calculation = xlCalculationAutomatic
Application.StatusBar = False
End If
End Sub
Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
Dim Reg As Object, Mh As Object, OneMh As Object
Dim Arr() As String, Index As Long
Dim Elm As String
Set Reg = CreateObject("Vbscript.Regexp")
With Reg
.MultiLine = True
.Global = True
.Ignorecase = False
.Pattern = Pattern
Set Mh = .Execute(OrgText)
Index = 0
ReDim Arr(1 To 1)
For Each OneMh In Mh
Index = Index + 1
ReDim Preserve Arr(1 To Index)
‘If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
Arr(Index) = OneMh.submatches(0)
‘Debug.Print OneMh.submatches(0)
Next OneMh
End With
RegGetArray = Arr
Set Reg = Nothing
Set Mh = Nothing
End Function
Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
‘传递参数 :原字符串, 匹配模式
Dim Regex As Object
Dim Mh As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
If Regex.test(OrgText) Then
Set Mh = Regex.Execute(OrgText)
RegGet = Mh.Item(0).submatches(0)
Else
RegGet = ""
End If
Set Regex = Nothing
End Function