Scraping content from Google search results with a query in Node.js

For my Node.js app, I need to get the first page of Google search results, but from the .com domain, because I need the knowledge graph information "People also search for", which only appears on google.com.

I assumed I could use the modules request and cheerio to scrape content from the Google search results page, but when I try to access the URL I want, that is, https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google, Google automatically redirects me to the .de domain (since I am based in Germany).

I tried loading the http://www.google.com/ncr URL first, which automatically disables country redirects in browsers, but that didn't work ...

Does anyone know what I can do differently to make it work?

Here is my code ... Thanks!

var request = require("request");
var cheerio = require("cheerio");

/**
 * Serialises cookie records into a single `Cookie` request-header value.
 *
 * Only the `key` and `value` properties of each record are used; any other
 * properties on a record are ignored.
 *
 * @param {Array<{key: *, value: *}>} dataCookie - cookie records to serialise.
 * @returns {string} `key=value` pairs joined with "; ", or "" when there are none.
 */
function dataCookieToString(dataCookie) {
    var pairs = [];
    var index = 0;
    while (index < dataCookie.length) {
        var entry = dataCookie[index];
        pairs.push(entry.key + "=" + entry.value);
        index += 1;
    }
    // Every pair contains "=", so joining is equivalent to prefixing "; " on all but the first.
    return pairs.join("; ");
}

/**
 * Parses one or more `Set-Cookie`-style strings into plain cookie records.
 *
 * The input is converted with `toString()`, so an array of header strings
 * (which stringifies to a comma-joined list) is accepted as well as a single
 * string. Individual cookies are separated by a comma that is NOT followed by
 * a space; ", " is left intact so date values such as
 * "Wed, 09 Jun 2021 10:18:14 GMT" are not cut in half.
 *
 * Each record has `key` and `value` taken from the first `name=value` segment,
 * plus one property per attribute (e.g. `expires`, `path`). Attributes without
 * "=" (e.g. `HttpOnly`) are present on the record with the value `undefined`.
 * Names and values are split at the FIRST "=" only, so values that themselves
 * contain "=" (base64 padding, nested pairs) are preserved in full.
 *
 * @param {string|Array<string>|{toString: function(): string}} cookie - raw cookie text.
 * @returns {Array<Object>} one record per cookie; empty chunks are skipped,
 *     so an empty input yields an empty array.
 */
function mkdataCookie(cookie) {
    // Splits "name=value" at the first "=" only; returns [name, undefined] when there is no "=".
    function splitAtFirstEquals(pair) {
        var eq = pair.indexOf("=");
        if (eq === -1) {
            return [pair, undefined];
        }
        return [pair.slice(0, eq), pair.slice(eq + 1)];
    }

    // The lookahead keeps the character after the comma in the next chunk.
    var rawCookies = cookie.toString().split(/,(?=[^ ])/);
    var records = [];

    for (var x = 0; x < rawCookies.length; x++) {
        if (rawCookies[x] === "") {
            continue;
        }

        var segments = rawCookies[x].split("; ");
        var nameValue = splitAtFirstEquals(segments[0]);
        var record = {
            key: nameValue[0],
            value: nameValue[1]
        };

        // Remaining segments are cookie attributes (expires, path, ...).
        for (var i = 1; i < segments.length; i++) {
            var attribute = splitAtFirstEquals(segments[i]);
            record[attribute[0]] = attribute[1];
        }

        records.push(record);
    }

    return records;
}

// Seed cookie for the first request. The `expires` attribute is parsed into the
// record, but dataCookieToString only emits key=value into the Cookie header.
var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));


// Step 1: request /ncr before searching, in an attempt to switch off the
// country redirect.
// NOTE(review): no cookie jar is configured here, so cookies returned by /ncr
// are presumably not sent with the search request below — confirm against the
// `jar` option in the request documentation.
request({
    uri: "https://www.google.com/ncr",
    headers: {
        'User-Agent': 'Mozilla/5.0',
        "Cookie": dataCookieToString(dataCookie)
    }
}, function(ncrError) {
    // Stop early on a transport failure instead of issuing the search anyway.
    if (ncrError) {
        console.error(ncrError);
        return;
    }

    // Step 2: fetch the search results page and print the knowledge-graph nodes.
    request({
        uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
        headers: {
            'User-Agent': 'Mozilla/5.0'
        }
    }, function(error, response, body) {
        // Without this guard a failed request would pass an undefined body to cheerio.
        if (error) {
            console.error(error);
            return;
        }

        console.log(body);
        var $ = cheerio.load(body);

        $(".kno-fb-ctx").each(function() {
            var link = $(this);
            var text = link.text();

            console.log(text);
        });
    });
});

      

+3


source to share


1 answer


Here's the solution: it's much easier than I thought.

However, I still have the problem that the body I am getting does not contain content that only appears when JavaScript is enabled.



Does anyone know how to change the code below so that the body also includes the content that only appears when JavaScript is enabled?

var request = require('request');
var cheerio = require("cheerio");

// Turn on the cookie jar for every request made through this wrapper, so that
// cookies set by the /ncr response are sent along with the search request.
request = request.defaults({jar: true});

var options = {
    url: 'http://www.google.com/ncr',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; rv:1.9.2.16) Gecko/20110319 Firefox/3.6.16'
    }
};

// Step 1: load /ncr first so its cookies are stored before searching.
request(options, function (ncrError) {
    // Stop early on a transport failure instead of issuing the search anyway.
    if (ncrError) {
        console.error(ncrError);
        return;
    }

    // Step 2: fetch the search results page and print the text of every <li>.
    request('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google', function (error, response, body) {
        // Without this guard a failed request would pass an undefined body to cheerio.
        if (error) {
            console.error(error);
            return;
        }

        var $ = cheerio.load(body);

        $("li").each(function() {
            var link = $(this);
            var text = link.text();

            console.log(text);
        });
    });
});

      

+1


source







All Articles