commit f9703fff5b9b8b2de5db79e7c5577052af156254 Author: Jeremy Singer-Vine Date: Fri Jan 31 01:46:26 2014 -0500 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8aefcfe --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.pyc +*.pyo +*.egg-info +dist/* +build/* +.DS_Store diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..f0799eb --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014, Jeremy Singer-Vine + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..127ef98 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +# compleat + +Fetch autocomplete suggestions from Google Search. Use responsibly. Not affiliated with Google. + +## Usage + +`compleat` can be used as either a Python library or command-line tool. 
+ +### Library + +```python +>>> import compleat +>>> q = compleat.suggest("is allen iverson ") +>>> q.meta +{ + 'lang': 'en', + 'query': 'is allen iverson ', + 'uid': '2c83d58b7350a9f55066cf4f49d16fd9', + 'timestamp': 'Fri Jan 31 00:13:00 2014' +} +>>> len(q.suggestions) +20 +>>> q.suggestions[0:5] + +[{'relevance': 800, + 'text': u'is allen iverson broke', + 'title': u'', + 'type': u'QUERY'}, + {'relevance': 601, + 'text': u'is allen iverson really broke', + 'title': u'', + 'type': u'QUERY'}, + {'relevance': 600, + 'text': u'is allen iverson still broke', + 'title': u'', + 'type': u'QUERY'}, + {'relevance': 566, + 'text': u'is allen iverson still in the nba', + 'title': u'', + 'type': u'QUERY'}, + {'relevance': 565, + 'text': u'is allen iverson back in the nba', + 'title': u'', + 'type': u'QUERY'}] +``` + +Note: `compleat.suggest()` also accepts an optional `lang` parameter, which is "en" (English) by default. + +```python +>>> import compleat +>>> [ s["text"] for s in compleat.suggest("bon", lang="en").suggestions[:5] ] +['bones', 'bonnie and clyde', 'bonobos', 'bon appetit', 'bonefish grill'] +>>> [ s["text"] for s in compleat.suggest("bon", lang="fr").suggestions[:5] ] +['bon coin', 'bonobo', 'bon prix', 'bon patron', 'bonnet'] +>>> [ s["text"] for s in compleat.suggest("bon", lang="es").suggestions[:5] ] +['bonoloto', 'bonus', 'bon jovi', 'bones', 'bonsai'] +``` + +### Command-line tool + +Run `compleat -h` from the command line for the full set of options. 
Examples: + +`compleat -q "where is "` + +`compleat -q "where is " --json` + +`compleat -q "where is " --db sqlite:///whereis.sqlite` + +`compleat -q "who is " "why is " "where is "` + +`compleat --template "is {} " -q "allen iverson" "marie curie" "meryl streep"` + diff --git a/compleat/__init__.py b/compleat/__init__.py new file mode 100644 index 0000000..246606e --- /dev/null +++ b/compleat/__init__.py @@ -0,0 +1,7 @@ +from query import Query + +VERSION = (0, 0, 0) +__version__ = ".".join(map(str,VERSION)) + +def suggest(query_string, lang="en"): + return Query(query_string, lang) diff --git a/compleat/output/__init__.py b/compleat/output/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/compleat/output/cli.py b/compleat/output/cli.py new file mode 100644 index 0000000..18ad9e7 --- /dev/null +++ b/compleat/output/cli.py @@ -0,0 +1,100 @@ +import compleat +import argparse +import itertools +import time +import sys +import codecs + +parser = argparse.ArgumentParser( + description="Retreive autocomplete suggetions, and output results as CSV [default] or JSON, or store in a database.") + +parser.add_argument("--queries", + "-q", + nargs="+", + required=True, + help="Queries to execute. Space-delimited. If --template is provided, query is first substituted into the template.") + +parser.add_argument("--template", + "-t", + default="{}", + help="String describing the shape of the query. {}s will be replaced by the values supplied to --queries. Default: '{}'") + +parser.add_argument("--languages", + "-l", + nargs="+", + default=["en"], + help="Languages (as two-letter codes) to search with. Default: 'en'") + +parser.add_argument("--db", + help="SQLAlchemy connection string to database. If supplied, outputs results to this database, instead of JSON or CSV. 
Requires `dataset` package.") + +parser.add_argument("--json", + action="store_true", + help="Output as JSON.") + +parser.add_argument("--silent", + "-s", + action="store_true", + help="Don't print query progress to STDERR.") + +parser.add_argument("--wait", + type=float, + default=0, + help="Seconds to wait between queries. Default: 0.") + +args = parser.parse_args() + +def to_db(db_str, queries): + import dataset + db = dataset.connect(args.db) + query_table = db.get_table("queries") + suggestion_table = db.get_table("suggestions") + for q in queries: + query_table.insert(q.meta) + suggestion_table.insert_many([ + dict(s.items() + { + "query_uid": q.uid + }.items()) for s in q.suggestions + ]) + +def to_csv(queries): + import unicodecsv + fieldnames = queries[0].meta.keys() + \ + queries[0].suggestions[0].keys() + writer = unicodecsv.DictWriter(sys.stdout, fieldnames) + writer.writeheader() + for q in queries: + for s in q.suggestions: + row_dict = dict(q.meta.items() + s.items()) + writer.writerow(row_dict) + +def to_json(queries): + import json + def convert(query): + sugg_tuple = ("suggestions", query.suggestions) + return dict(query.meta.items() + [ sugg_tuple ]) + obj = map(convert, queries) + json.dump(obj, sys.stdout) + +def log_query(query_string, lang): + if args.silent: return + sys.stderr.write("{lang}: {query}\n".format( + lang=lang, + query=query_string)) + +def exec_query(query_string, lang): + time.sleep(args.wait) + log_query(query_string, lang) + q = compleat.suggest(query_string, lang) + return q + +def main(): + templated = map(args.template.format, args.queries) + combos = itertools.product(templated, args.languages) + queries = [ exec_query(*c) for c in combos ] + if args.db: to_db(args.db, queries) + elif args.json: to_json(queries) + else: to_csv(queries) + +if __name__ == "__main__": + main() diff --git a/compleat/query.py b/compleat/query.py new file mode 100644 index 0000000..f657164 --- /dev/null +++ b/compleat/query.py @@ -0,0 +1,59 @@ 
+import urllib +import hashlib +import requests +import json +import datetime +import random + +class Query(object): + URL_TEMPLATE = "http://suggestqueries.google.com/complete/search?client=chrome&hl={lang}&q={query}" + def __init__(self, query, lang="en"): + self.query = query + self.lang = lang + self.timestamp = datetime.datetime.now() + self.rand = str(random.random()) + req = requests.get(self.url) + utf8 = req.content.decode("latin-1").encode("utf-8") + self.response = json.loads(utf8) + + @property + def url(self): + escaped = urllib.quote(self.query) + return self.URL_TEMPLATE.format( + query=escaped, + lang=self.lang) + + @property + def suggestions(self): + query, sugg_texts, sugg_titles, _, meta = self.response + zipped = zip( + sugg_texts,\ + sugg_titles,\ + meta["google:suggesttype"],\ + meta["google:suggestrelevance"]) + dicts = [ { + "text": z[0], + "title": z[1], + "type": z[2], + "relevance": z[3] + } for z in zipped ] + return dicts + + @property + def uid(self): + _ = ":".join([ + self.query, + self.lang, + self.timestamp.ctime(), + self.rand + ]) + return hashlib.md5(_).hexdigest() + + @property + def meta(self): + return { + "query": self.query, + "lang": self.lang, + "timestamp": self.timestamp.ctime(), + "uid": self.uid + } diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8054d45 --- /dev/null +++ b/setup.py @@ -0,0 +1,45 @@ +import sys +from setuptools import setup, find_packages + +py26_dependency = [] +if sys.version_info <= (2, 6): + py26_dependency = ["argparse >= 1.2.1"] + +setup( + name='compleat', + version='0.0.0', + description="Fetch autocomplete suggestions from Google Search. Use responsibly. 
Not affiliated with Google.", + long_description="", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.3' + ], + keywords='autocomplete google search', + author='Jeremy Singer-Vine', + author_email='jsvine@gmail.com', + url='http://github.com/jsvine/compleat/', + license='MIT', + packages=find_packages(exclude=['test',]), + namespace_packages=[], + include_package_data=False, + zip_safe=False, + install_requires=[ + "requests >= 2.2.1", + "unicodecsv >= 0.9.4" + ] + py26_dependency, + extras_require={ + "dataset": [ "dataset" ] + }, + tests_require=[], + test_suite='test', + entry_points={ + 'console_scripts': [ + 'compleat = compleat.output.cli:main', + ] + } +) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29