diff --git a/.idea/encodings.xml b/.idea/encodings.xml
new file mode 100644
index 0000000..15a15b2
--- /dev/null
+++ b/.idea/encodings.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..28a804d
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..0d4b4cb
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/node-google-search-scraper.iml b/.idea/node-google-search-scraper.iml
new file mode 100644
index 0000000..24643cc
--- /dev/null
+++ b/.idea/node-google-search-scraper.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..8e50374
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,211 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1555788342006
+
+
+ 1555788342006
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 6feae59..fa46c3f 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,27 @@
google-search-scraper
=============
+
+### SamuelFaj FORK
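+This fork adds Promise support: `search` still calls the callback once per result, and it now also returns a Promise that resolves with all collected results when the search has finished.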
+``` javascript
+const scraper = require("samuelfaj-google-scraper");
+var options = {
+ query: 'nodejs',
+ limit: 10
+};
+
+scraper.search(options, function(err, url, meta) {
+ // This is called for each result
+ if(err) throw err;
+ console.log(url);
+ console.log(meta.title);
+ console.log(meta.meta);
+ console.log(meta.desc)
+}).then((results) => {
+ console.log("All searches ended", results);
+});
+```
+
### Google search scraper with captcha solving support
This module allows google search results extraction in a simple yet flexible way, and handles captcha solving transparently (through external services or your own hand-made solver).
diff --git a/index.js b/index.js
index c0d32ae..84da690 100644
--- a/index.js
+++ b/index.js
@@ -3,7 +3,6 @@ var cheerio = require('cheerio');
var url = require('url');
function search(options, callback) {
-
var session = request.defaults({ jar : true });
var host = options.host || 'www.google.com';
var solver = options.solver;
@@ -17,77 +16,41 @@ function search(options, callback) {
params.start = params.start || 0;
- getPage(params, function onPage(err, body) {
- if(err) {
- if(err.code !== 'ECAPTCHA' || !solver) return callback(err);
-
- solveCaptcha(err.location, function(err, page) {
- if(err) return callback(err);
- onPage(null, page);
- });
-
- return;
- }
-
- var currentResults = extractResults(body);
-
- var newResults = currentResults.filter(function(result) {
- return results.indexOf(result) === -1;
- });
-
- newResults.forEach(function(result) {
- callback(null, result['url'], result);
- });
-
- if(newResults.length === 0) {
- return;
- }
-
- results = results.concat(newResults);
-
- if(!options.limit || results.length < options.limit) {
- params.start = results.length;
- getPage(params, onPage);
- }
- });
-
-
function getPage(params, callback) {
session.get({
- uri: 'https://' + host + '/search',
- qs: params,
- followRedirect: false
- },
- function(err, res) {
- if(err) return callback(err);
+ uri: 'https://' + host + '/search',
+ qs: params,
+ followRedirect: false
+ },
+ function(err, res) {
+ if(err) return callback(err);
- if(res.statusCode === 302) {
- var parsed = url.parse(res.headers.location, true);
-
- if(parsed.pathname !== '/search') {
- var err = new Error('Captcha');
- err.code = 'ECAPTCHA';
- err.location = res.headers.location;
- this.abort();
- return callback(err);
- } else {
- session.get({
- uri: res.headers.location,
- qs: params,
- followRedirect: false
- }, function(err, res) {
- if(err) return callback(err);
- callback(null, res.body);
- });
- return;
+ if(res.statusCode === 302) {
+ var parsed = url.parse(res.headers.location, true);
+
+ if(parsed.pathname !== '/search') {
+ var err = new Error('Captcha');
+ err.code = 'ECAPTCHA';
+ err.location = res.headers.location;
+ this.abort();
+ return callback(err);
+ } else {
+ session.get({
+ uri: res.headers.location,
+ qs: params,
+ followRedirect: false
+ }, function(err, res) {
+ if(err) return callback(err);
+ callback(null, res.body);
+ });
+ return;
+ }
}
- }
- callback(null, res.body);
- }
+ callback(null, res.body);
+ }
);
}
-
function extractResults(body) {
var results = [];
var $ = cheerio.load(body);
@@ -107,11 +70,10 @@ function search(options, callback) {
item['desc'] = elemDesc.text();
results.push(item);
- });
+ });
return results;
}
-
function solveCaptcha(captchaUrl, callback) {
var tmp = url.parse(captchaUrl);
@@ -140,17 +102,17 @@ function search(options, callback) {
// Try solution
session.get({
- uri: baseUrl + '/sorry/' + formAction,
- qs: {
- id: captchaId,
- captcha: solution,
- continue: continueUrl
+ uri: baseUrl + '/sorry/' + formAction,
+ qs: {
+ id: captchaId,
+ captcha: solution,
+ continue: continueUrl
+ }
+ },
+ function(err, res) {
+ if(res.statusCode !== 200) return callback(new Error('Captcha decoding failed'));
+ callback(null, res.body);
}
- },
- function(err, res) {
- if(res.statusCode !== 200) return callback(new Error('Captcha decoding failed'));
- callback(null, res.body);
- }
);
});
@@ -161,6 +123,46 @@ function search(options, callback) {
}
+ const _results = []; // every [err, url, meta] tuple handed to callback, in order
+ // Crawl result pages one at a time: each result goes to callback, and the promise settles once crawling stops.
+ return new Promise(resolve => {
+   getPage(params, function onPage(err, body) {
+     if(err) {
+       if(err.code !== 'ECAPTCHA' || !solver) { callback(err); return resolve(_results); } // settle instead of hanging forever
+       // Captcha redirect and a solver is configured: solve it, then resume with the returned page.
+       solveCaptcha(err.location, function(err, page) {
+         if(err) { callback(err); return resolve(_results); } // solver failed: report it, then settle
+         onPage(null, page);
+       });
+       // The solver callback above continues (or ends) the crawl.
+       return;
+     }
+     // extractResults returns the array of items parsed from this page's HTML.
+     var currentResults = extractResults(body);
+     // Keep only the items that are not already in the running results list.
+     var newResults = currentResults.filter(function(result) {
+       return results.indexOf(result) === -1;
+     });
+     // Record and report each new item.
+     newResults.forEach(function(result) {
+       _results.push([null, result['url'], result]);
+       callback(null, result['url'], result);
+     });
+     // A page with nothing new means there is nothing further to fetch.
+     if(newResults.length === 0) {
+       return resolve(_results); // return here: falling through would request the same start offset again
+     }
+     // results is the running list used for the limit check and the next start offset.
+     results = results.concat(newResults);
+     // Fetch the next page until options.limit (when set) has been reached.
+     if(!options.limit || results.length < options.limit) {
+       params.start = results.length;
+       getPage(params, onPage);
+     }else{
+       resolve(_results);
+     }
+   });
+ });
}
module.exports.search = search;
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..0184c43
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,311 @@
+{
+ "name": "google-search-scraper",
+ "version": "0.1.0",
+ "lockfileVersion": 1,
+ "requires": true,
+ "dependencies": {
+ "CSSselect": {
+ "version": "0.4.1",
+ "resolved": "https://registry.npmjs.org/CSSselect/-/CSSselect-0.4.1.tgz",
+ "integrity": "sha1-+Kt+H4QYzmPNput713ioXX7EkrI=",
+ "requires": {
+ "CSSwhat": "0.4",
+ "domutils": "1.4"
+ }
+ },
+ "CSSwhat": {
+ "version": "0.4.7",
+ "resolved": "https://registry.npmjs.org/CSSwhat/-/CSSwhat-0.4.7.tgz",
+ "integrity": "sha1-hn2g/zn3eGEyQsRM/qg/CqTr35s="
+ },
+ "asn1": {
+ "version": "0.1.11",
+ "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.1.11.tgz",
+ "integrity": "sha1-VZvhg3bQik7E2+gId9J4GGObLfc=",
+ "optional": true
+ },
+ "assert-plus": {
+ "version": "0.1.5",
+ "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.1.5.tgz",
+ "integrity": "sha1-7nQAlBMALYTOxyGcasgRgS5yMWA=",
+ "optional": true
+ },
+ "async": {
+ "version": "0.9.2",
+ "resolved": "https://registry.npmjs.org/async/-/async-0.9.2.tgz",
+ "integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0=",
+ "optional": true
+ },
+ "aws-sign2": {
+ "version": "0.5.0",
+ "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.5.0.tgz",
+ "integrity": "sha1-xXED96F/wDfwLXwuZLYC6iI/fWM=",
+ "optional": true
+ },
+ "boom": {
+ "version": "0.4.2",
+ "resolved": "https://registry.npmjs.org/boom/-/boom-0.4.2.tgz",
+ "integrity": "sha1-emNune1O/O+xnO9JR6PGffrukRs=",
+ "requires": {
+ "hoek": "0.9.x"
+ }
+ },
+ "cheerio": {
+ "version": "0.13.1",
+ "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-0.13.1.tgz",
+ "integrity": "sha1-SK8RNFYbNSf4PZFWxPmo69grBuw=",
+ "requires": {
+ "CSSselect": "~0.4.0",
+ "entities": "0.x",
+ "htmlparser2": "~3.4.0",
+ "underscore": "~1.5"
+ }
+ },
+ "combined-stream": {
+ "version": "0.0.7",
+ "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-0.0.7.tgz",
+ "integrity": "sha1-ATfmV7qlp1QcV6w3rF/AfXO03B8=",
+ "optional": true,
+ "requires": {
+ "delayed-stream": "0.0.5"
+ }
+ },
+ "core-util-is": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
+ "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
+ },
+ "cryptiles": {
+ "version": "0.2.2",
+ "resolved": "https://registry.npmjs.org/cryptiles/-/cryptiles-0.2.2.tgz",
+ "integrity": "sha1-7ZH/HxetE9N0gohZT4pIoNJvMlw=",
+ "optional": true,
+ "requires": {
+ "boom": "0.4.x"
+ }
+ },
+ "ctype": {
+ "version": "0.5.3",
+ "resolved": "https://registry.npmjs.org/ctype/-/ctype-0.5.3.tgz",
+ "integrity": "sha1-gsGMJGH3QRTvFsE1IkrQuRRMoS8=",
+ "optional": true
+ },
+ "delayed-stream": {
+ "version": "0.0.5",
+ "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-0.0.5.tgz",
+ "integrity": "sha1-1LH0OpPoKW3+AmlPRoC8N6MTxz8=",
+ "optional": true
+ },
+ "domelementtype": {
+ "version": "1.3.1",
+ "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz",
+ "integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w=="
+ },
+ "domhandler": {
+ "version": "2.2.1",
+ "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.2.1.tgz",
+ "integrity": "sha1-Wd+dzSJ+gIs2Wuc+H2aErD2Ub8I=",
+ "requires": {
+ "domelementtype": "1"
+ }
+ },
+ "domutils": {
+ "version": "1.4.3",
+ "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.4.3.tgz",
+ "integrity": "sha1-CGVRN5bGswYDGFDhdVFrr4C3Km8=",
+ "requires": {
+ "domelementtype": "1"
+ }
+ },
+ "entities": {
+ "version": "0.5.0",
+ "resolved": "https://registry.npmjs.org/entities/-/entities-0.5.0.tgz",
+ "integrity": "sha1-9hHLWuIhBQ4AEsZpeVA/164ZzEk="
+ },
+ "forever-agent": {
+ "version": "0.5.2",
+ "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.5.2.tgz",
+ "integrity": "sha1-bQ4JxJIflKJ/Y9O0nF/v8epMUTA="
+ },
+ "form-data": {
+ "version": "0.1.4",
+ "resolved": "https://registry.npmjs.org/form-data/-/form-data-0.1.4.tgz",
+ "integrity": "sha1-kavXiKupcCsaq/qLwBAxoqyeOxI=",
+ "optional": true,
+ "requires": {
+ "async": "~0.9.0",
+ "combined-stream": "~0.0.4",
+ "mime": "~1.2.11"
+ }
+ },
+ "hawk": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/hawk/-/hawk-1.0.0.tgz",
+ "integrity": "sha1-uQuxaYByhUEdp//LjdJZhQLTtS0=",
+ "optional": true,
+ "requires": {
+ "boom": "0.4.x",
+ "cryptiles": "0.2.x",
+ "hoek": "0.9.x",
+ "sntp": "0.2.x"
+ }
+ },
+ "hoek": {
+ "version": "0.9.1",
+ "resolved": "https://registry.npmjs.org/hoek/-/hoek-0.9.1.tgz",
+ "integrity": "sha1-PTIkYrrfB3Fup+uFuviAec3c5QU="
+ },
+ "htmlparser2": {
+ "version": "3.4.0",
+ "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.4.0.tgz",
+ "integrity": "sha1-oc1l9YI60oXhnWOwha1yLQpR6uc=",
+ "requires": {
+ "domelementtype": "1",
+ "domhandler": "2.2",
+ "domutils": "1.3",
+ "readable-stream": "1.1"
+ },
+ "dependencies": {
+ "domutils": {
+ "version": "1.3.0",
+ "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.3.0.tgz",
+ "integrity": "sha1-mtTVm1r2ymhMYv5tdo7xcOcN8ZI=",
+ "requires": {
+ "domelementtype": "1"
+ }
+ }
+ }
+ },
+ "http-signature": {
+ "version": "0.10.1",
+ "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-0.10.1.tgz",
+ "integrity": "sha1-T72sEyVZqoMjEh5UB3nAoBKyfmY=",
+ "optional": true,
+ "requires": {
+ "asn1": "0.1.11",
+ "assert-plus": "^0.1.5",
+ "ctype": "0.5.3"
+ }
+ },
+ "inherits": {
+ "version": "2.0.3",
+ "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
+ "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
+ },
+ "ip-regex": {
+ "version": "2.1.0",
+ "resolved": "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz",
+ "integrity": "sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk=",
+ "optional": true
+ },
+ "isarray": {
+ "version": "0.0.1",
+ "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz",
+ "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8="
+ },
+ "json-stringify-safe": {
+ "version": "5.0.1",
+ "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
+ "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
+ },
+ "mime": {
+ "version": "1.2.11",
+ "resolved": "https://registry.npmjs.org/mime/-/mime-1.2.11.tgz",
+ "integrity": "sha1-WCA+7Ybjpe8XrtK32evUfwpg3RA="
+ },
+ "node-uuid": {
+ "version": "1.4.8",
+ "resolved": "https://registry.npmjs.org/node-uuid/-/node-uuid-1.4.8.tgz",
+ "integrity": "sha1-sEDrCSOWivq/jTL7HxfxFn/auQc="
+ },
+ "oauth-sign": {
+ "version": "0.3.0",
+ "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.3.0.tgz",
+ "integrity": "sha1-y1QPk7srIqfVlBaRoojWDo6pOG4=",
+ "optional": true
+ },
+ "psl": {
+ "version": "1.1.31",
+ "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.31.tgz",
+ "integrity": "sha512-/6pt4+C+T+wZUieKR620OpzN/LlnNKuWjy1iFLQ/UG35JqHlR/89MP1d96dUfkf6Dne3TuLQzOYEYshJ+Hx8mw==",
+ "optional": true
+ },
+ "punycode": {
+ "version": "2.1.1",
+ "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz",
+ "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==",
+ "optional": true
+ },
+ "qs": {
+ "version": "0.6.6",
+ "resolved": "https://registry.npmjs.org/qs/-/qs-0.6.6.tgz",
+ "integrity": "sha1-bgFQmP9RlouKPIGQAdXyyJvEsQc="
+ },
+ "readable-stream": {
+ "version": "1.1.14",
+ "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz",
+ "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=",
+ "requires": {
+ "core-util-is": "~1.0.0",
+ "inherits": "~2.0.1",
+ "isarray": "0.0.1",
+ "string_decoder": "~0.10.x"
+ }
+ },
+ "request": {
+ "version": "2.33.0",
+ "resolved": "https://registry.npmjs.org/request/-/request-2.33.0.tgz",
+ "integrity": "sha1-UWeHgTFyYHDsYzdS6iMKI3ncZf8=",
+ "requires": {
+ "aws-sign2": "~0.5.0",
+ "forever-agent": "~0.5.0",
+ "form-data": "~0.1.0",
+ "hawk": "~1.0.0",
+ "http-signature": "~0.10.0",
+ "json-stringify-safe": "~5.0.0",
+ "mime": "~1.2.9",
+ "node-uuid": "~1.4.0",
+ "oauth-sign": "~0.3.0",
+ "qs": "~0.6.0",
+ "tough-cookie": ">=0.12.0",
+ "tunnel-agent": "~0.3.0"
+ }
+ },
+ "sntp": {
+ "version": "0.2.4",
+ "resolved": "https://registry.npmjs.org/sntp/-/sntp-0.2.4.tgz",
+ "integrity": "sha1-+4hfGLDzqtGJ+CSGJTa87ux1CQA=",
+ "optional": true,
+ "requires": {
+ "hoek": "0.9.x"
+ }
+ },
+ "string_decoder": {
+ "version": "0.10.31",
+ "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
+ "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ="
+ },
+ "tough-cookie": {
+ "version": "3.0.1",
+ "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-3.0.1.tgz",
+ "integrity": "sha512-yQyJ0u4pZsv9D4clxO69OEjLWYw+jbgspjTue4lTQZLfV0c5l1VmK2y1JK8E9ahdpltPOaAThPcp5nKPUgSnsg==",
+ "optional": true,
+ "requires": {
+ "ip-regex": "^2.1.0",
+ "psl": "^1.1.28",
+ "punycode": "^2.1.1"
+ }
+ },
+ "tunnel-agent": {
+ "version": "0.3.0",
+ "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.3.0.tgz",
+ "integrity": "sha1-rWgbaPUyGtKCfEz7G31d8s/pQu4=",
+ "optional": true
+ },
+ "underscore": {
+ "version": "1.5.2",
+ "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.5.2.tgz",
+ "integrity": "sha1-EzXF5PXm0zu7SwBrqMhqAPVW3gg="
+ }
+ }
+}
diff --git a/package.json b/package.json
index 70580c9..a79a268 100644
--- a/package.json
+++ b/package.json
@@ -1,17 +1,20 @@
{
- "name": "google-search-scraper",
- "version": "0.1.0",
- "description": "Google search scraper with captcha solving support",
- "author": "thibauts",
+ "name": "samuelfaj-google-scraper",
+ "version": "0.1.1",
+ "description": "A fork of Google search scraper with captcha solving support, with added Promise support",
+ "author": "Samuel Fajreldines",
"license": "MIT",
"main": "index.js",
"dependencies": {
"cheerio": "~0.13.1",
"request": "~2.33.0"
},
+ "scripts": {
+ "test": "node test.js"
+ },
"repository": {
"type": "git",
- "url": "git://github.com/thibauts/node-google-search-scraper.git"
+ "url": "git://github.com/samuelfaj/node-google-search-scraper"
},
"keywords": [
"google",