The scraper cannot parse the content from the first page

I wrote some code to parse the names, addresses and phone numbers of different stores from yell.com. If any search link is passed to my crawler, it parses all the content no matter how many pages it is spread across. However, the one problem I have found is that it always skips the content of the first page: if there are 10 pages, my scraper only covers 9 of them. A small tweak would probably fix it, but I can't figure out what. Here is the complete code. Thanks in advance.

' Fetches a yell.com search-results page, follows the links found in its
' "row pagination" block, and writes one worksheet row per listing
' (name, three address parts, phone) for every page it follows.
' NOTE: only pages that have an <a> in the pagination block are fetched. In the
' sample markup the currently selected page is a <span>, not an <a>, so the
' listings of the initially loaded page are never written.
' NOTE(review): i and x are not declared here; without Option Explicit they
' would be implicit Variants - confirm module settings.
Sub YellUK()
Const mlink = "https://www.yell.com"
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument, htm As New HTMLDocument
Dim post As HTMLHtmlElement, page As Object, newlink As String

' Load the first results page into html; it is used only to read the pagination links.
With http
    .Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
    .send
    html.body.innerHTML = .responseText
End With
' All anchors inside the first "row pagination" element.
Set page = html.getElementsByClassName("row pagination")(0).getElementsByTagName("a")
' Length - 2 leaves out the last anchor, which in the sample markup is the "Next" link.
For i = 0 To page.Length - 2
    ' Strip the "about:" prefix from the href before prepending the site root
    ' (presumably added because the document has no real base URL - confirm).
    newlink = mlink & Replace(page(i).href, "about:", "")
    With http
        .Open "GET", newlink, False
        .send
        htm.body.innerHTML = .responseText
    End With

    ' One worksheet row per listing; x counts listings across all pages, rows start at 2.
    For Each post In htm.getElementsByClassName("js-LocalBusiness")
        x = x + 1
        ' Column 1: text of the first link in the title row.
        With post.getElementsByClassName("row businessCapsule--title")(0).getElementsByTagName("a")
            If .Length Then Cells(x + 1, 1) = .Item(0).innerText
        End With
        ' Columns 2-4: second, third and fourth <span> of the address block, when present.
        With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
            If .Length > 1 Then Cells(x + 1, 2) = .Item(1).innerText
        End With
        With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
            If .Length > 2 Then Cells(x + 1, 3) = .Item(2).innerText
        End With
        With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
            If .Length > 3 Then Cells(x + 1, 4) = .Item(3).innerText
        End With
        ' Column 5: second element carrying the "businessCapsule--tel" class, when present.
        With post.getElementsByClassName("businessCapsule--tel")
            If .Length > 1 Then Cells(x + 1, 5) = .Item(1).innerText
        End With
    Next post
Next i
End Sub

      

Here are the elements that store the page number for the next page:

<div class="row pagination">
<div class="col-sm-24">
&nbsp;<span class="pagination--page is-selected">1</span>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=2" data-tracking="DISPLAY:PAGINATION:NUMBER">2</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=3" data-tracking="DISPLAY:PAGINATION:NUMBER">3</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=4" data-tracking="DISPLAY:PAGINATION:NUMBER">4</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=5" data-tracking="DISPLAY:PAGINATION:NUMBER">5</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=6" data-tracking="DISPLAY:PAGINATION:NUMBER">6</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=7" data-tracking="DISPLAY:PAGINATION:NUMBER">7</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=8" data-tracking="DISPLAY:PAGINATION:NUMBER">8</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=9" data-tracking="DISPLAY:PAGINATION:NUMBER">9</a>
&nbsp;<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=10" data-tracking="DISPLAY:PAGINATION:NUMBER">10</a>
&nbsp;<a rel="nofollow" class="pagination--next" href="/ucs/UcsSearchAction.do?location=United+Kingdom&amp;keywords=pizza&amp;scrambleSeed=721890588&amp;pageNum=2" data-tracking="DISPLAY:PAGINATION:NEXT">Next</a>
</div>
</div>

      

+3


source to share


1 answer


The problem is that the first page is already selected and therefore does not have an anchor in the pagination block. The solution is to process the first page on its own first, and then process the remaining pages via the pagination links. HTH

Option Explicit

' Scrapes the yell.com search results for the hard-coded query and writes every
' listing to the active worksheet via GetPageData.
'
' The initially loaded page is processed directly, because the pagination block
' has no <a> for the currently selected page. The remaining pages are then
' fetched through the pagination anchors.
Sub YellUK()
    Const mlink = "https://www.yell.com"
    Dim http As New MSXML2.XMLHTTP60
    Dim html As New HTMLDocument
    Dim html2 As New HTMLDocument
    Dim pager As Object, page As Object, newlink As String
    Dim i As Long
    Dim x   ' running listing count shared across pages (passed ByRef to GetPageData)

    With http
        .Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
        .send
        html.body.innerHTML = .responseText
    End With

    ' First page first: it is already loaded and has no pagination anchor.
    GetPageData x, html

    ' A result set that fits on one page has no pagination block at all;
    ' stop here instead of failing on a Nothing reference.
    Set pager = html.getElementsByClassName("row pagination")(0)
    If pager Is Nothing Then Exit Sub
    Set page = pager.getElementsByTagName("a")

    ' Remaining pages. Length - 2 skips the trailing "Next" anchor, which
    ' duplicates one of the numbered links.
    For i = 0 To page.Length - 2
        newlink = mlink & Replace(page(i).href, "about:", "")
        With http
            .Open "GET", newlink, False
            .send
            html2.body.innerHTML = .responseText
        End With
        GetPageData x, html2
    Next i
End Sub

' Writes one worksheet row per business listing found in the given results page.
'
' Parameters:
'   x    - running count of listings written so far (ByRef). It is incremented
'          once per listing and the listing goes to row x + 1, so the caller's
'          counter carries over from page to page.
'   html - parsed results page, scanned for "js-LocalBusiness" elements.
'
' Columns written: 1 = text of the first title link, 2-4 = second to fourth
' <span> of the address block, 5 = second "businessCapsule--tel" element.
' A field whose element is missing is left blank; the row is still consumed.
Private Sub GetPageData(ByRef x, ByRef html As HTMLDocument)
    Const ADDRESS_CLASS = "col-sm-10 col-md-11 col-lg-12 businessCapsule--address"
    Dim post As HTMLHtmlElement
    Dim holder As Object, nodes As Object
    Dim spanIdx As Long

    For Each post In html.getElementsByClassName("js-LocalBusiness")
        x = x + 1

        ' Name: guard the container so a listing without a title row
        ' does not abort the whole run.
        Set holder = post.getElementsByClassName("row businessCapsule--title")(0)
        If Not holder Is Nothing Then
            Set nodes = holder.getElementsByTagName("a")
            If nodes.Length Then Cells(x + 1, 1) = nodes.Item(0).innerText
        End If

        ' Address: look the container up once, then copy spans 1..3
        ' into columns 2..4 when they exist.
        Set holder = post.getElementsByClassName(ADDRESS_CLASS)(0)
        If Not holder Is Nothing Then
            Set nodes = holder.getElementsByTagName("span")
            For spanIdx = 1 To 3
                If nodes.Length > spanIdx Then Cells(x + 1, spanIdx + 1) = nodes.Item(spanIdx).innerText
            Next spanIdx
        End If

        ' Phone: the second matching element is used, as before.
        Set nodes = post.getElementsByClassName("businessCapsule--tel")
        If nodes.Length > 1 Then Cells(x + 1, 5) = nodes.Item(1).innerText
    Next post
End Sub

      



EDIT: Maybe something like this. The link for the first page is created when i = -1, and then the remaining pages are handled as normal.

' Visit every results page, starting one step before the first pagination
' anchor so that page 1 (which has no anchor of its own) is included.
For i = -1 To page.Length - 2
    If i >= 0 Then
        newlink = mlink & Replace(page(i).href, "about:", "")
    Else
        ' Build the page-1 link from the first anchor by replacing the
        ' final character of its URL with "1".
        newlink = mlink & Replace(page(0).href, "about:", "")
        newlink = Left(newlink, Len(newlink) - 1) & "1"
    End If
    Debug.Print i & ", " & newlink ' Prints the links for all the pages
    http.Open "GET", newlink, False
    http.send
    htm.body.innerHTML = http.responseText
    ' Get page data here ...
Next i

      

+1


source







All Articles