From ea29884396eaa9b38bd0ca633873d24084742fe2 Mon Sep 17 00:00:00 2001 From: Madison May Date: Mon, 3 Nov 2014 15:42:23 -0500 Subject: [PATCH 1/5] NER + document classification --- indicoio/__init__.py | 4 ++++ indicoio/local/__init__.py | 4 ++++ indicoio/text/classification.py | 25 +++++++++++++++++++++++++ indicoio/text/lang.py | 3 --- indicoio/text/ner.py | 23 +++++++++++++++++++++++ indicoio/text/sentiment.py | 3 --- tests/local/test_local.py | 12 ++++++++++++ tests/remote/test_remote.py | 15 ++++++++++++++- 8 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 indicoio/text/classification.py create mode 100644 indicoio/text/ner.py diff --git a/indicoio/__init__.py b/indicoio/__init__.py index d92b8ba..d9965a4 100644 --- a/indicoio/__init__.py +++ b/indicoio/__init__.py @@ -8,6 +8,8 @@ Version, version, __version__, VERSION = ('0.4.5',) * 4 from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language +from indicoio.text.classification import classification +from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -20,3 +22,5 @@ language = partial(language, config.api_root) fer = partial(fer, config.api_root) facial_features = partial(facial_features, config.api_root) image_features = partial(image_features, config.api_root) +classification = partial(classification, config.api_root) +named_entities = partial(named_entities, config.api_root) diff --git a/indicoio/local/__init__.py b/indicoio/local/__init__.py index 9d44439..63d55bd 100644 --- a/indicoio/local/__init__.py +++ b/indicoio/local/__init__.py @@ -6,6 +6,8 @@ JSON_HEADERS = {'Content-type': 'application/json', 'Accept': 'text/plain'} from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import 
language +from indicoio.text.classification import classification +from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -18,3 +20,5 @@ language = partial(language, config.local_api_root) fer = partial(fer, config.local_api_root) facial_features = partial(facial_features, config.local_api_root) image_features = partial(image_features, config.local_api_root) +classification = partial(classification, config.local_api_root) +named_entities = partial(named_entities, config.local_api_root) diff --git a/indicoio/text/classification.py b/indicoio/text/classification.py new file mode 100644 index 0000000..3d7bfd0 --- /dev/null +++ b/indicoio/text/classification.py @@ -0,0 +1,25 @@ +from indicoio.utils import api_handler + +def classification(api_root, text): + """ + Given input text, returns a probability distribution over 100 document categories + + Example usage: + + .. code-block:: python + + >>> import indicoio + >>> import numpy as np + >>> text = 'Monday: Delightful with mostly sunny skies. Highs in the low 70s.' + >>> possible = indicoio.classification(text) + >>> category = possible.keys()[np.argmax(possible.values())] + >>> probability = np.max(possible.values()) + >>> "Predicted category '%s' with probability %.4f"%(category,probability) + u"Predicted category 'Weather' with probability 0.8548" + + :param text: The text to be analyzed. 
+ :type text: str or unicode + :rtype: Dictionary of class probability pairs + """ + + return api_handler(text, api_root + "documentclassification") diff --git a/indicoio/text/lang.py b/indicoio/text/lang.py index 169b415..068d45d 100644 --- a/indicoio/text/lang.py +++ b/indicoio/text/lang.py @@ -1,6 +1,3 @@ -import requests -import json - from indicoio.utils import api_handler def language(api_root, text): diff --git a/indicoio/text/ner.py b/indicoio/text/ner.py new file mode 100644 index 0000000..23b2792 --- /dev/null +++ b/indicoio/text/ner.py @@ -0,0 +1,23 @@ +from indicoio.utils import api_handler + +def named_entities(api_root, text): + """ + Given input text, returns a mapping from named entities to + named entity categories. + + Example usage: + + .. code-block:: python + + >>> import indicoio + >>> import numpy as np + >>> text = 'On Monday, president Barack Obama will be...' + >>> indicoio.named_entities(text) + >>> "{'Monday': 'Time', 'Barack Obama': 'Person'}" + + :param text: The text to be analyzed. + :type text: str or unicode + :rtype: Dictionary of named entity, category pairs + """ + + return api_handler(text, api_root + "ner") diff --git a/indicoio/text/sentiment.py b/indicoio/text/sentiment.py index e60cd42..81f0363 100644 --- a/indicoio/text/sentiment.py +++ b/indicoio/text/sentiment.py @@ -1,6 +1,3 @@ -import requests -import json - from indicoio import JSON_HEADERS from indicoio.utils import api_handler diff --git a/tests/local/test_local.py b/tests/local/test_local.py index f8bc5dd..7d382d6 100644 --- a/tests/local/test_local.py +++ b/tests/local/test_local.py @@ -20,6 +20,18 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) + def test_document_classification(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." 
+ results = classification(text) + self.assertTrue(categories < set(results.keys())) + + def test_named_entity_recognition(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = named_entities(text) + self.assertTrue('named entity' in set(results.keys())) + def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people." diff --git a/tests/remote/test_remote.py b/tests/remote/test_remote.py index 9c75c3b..ed2ea06 100644 --- a/tests/remote/test_remote.py +++ b/tests/remote/test_remote.py @@ -4,7 +4,8 @@ import os import numpy as np import skimage.io -from indicoio import political, sentiment, fer, facial_features, language, image_features +from indicoio import political, sentiment, fer, facial_features, language, image_features, \ + classification, named_entities DIR = os.path.dirname(os.path.realpath(__file__)) @@ -22,6 +23,18 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) + def test_document_classification(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = classification(text) + self.assertTrue(categories < set(results.keys())) + + def test_named_entity_recognition(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = named_entities(text) + self.assertTrue('named entity' in set(results.keys())) + def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people." 
From 5ef6c2ad176ab275bfee119d0b9104ca893a768a Mon Sep 17 00:00:00 2001 From: Madison May Date: Wed, 5 Nov 2014 13:37:15 -0500 Subject: [PATCH 2/5] Remove ner --- indicoio/__init__.py | 2 -- indicoio/local/__init__.py | 2 -- indicoio/text/ner.py | 23 ----------------------- tests/local/test_local.py | 6 ------ tests/remote/test_remote.py | 9 +-------- 5 files changed, 1 insertion(+), 41 deletions(-) delete mode 100644 indicoio/text/ner.py diff --git a/indicoio/__init__.py b/indicoio/__init__.py index d9965a4..df5a31e 100644 --- a/indicoio/__init__.py +++ b/indicoio/__init__.py @@ -9,7 +9,6 @@ from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language from indicoio.text.classification import classification -from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -23,4 +22,3 @@ fer = partial(fer, config.api_root) facial_features = partial(facial_features, config.api_root) image_features = partial(image_features, config.api_root) classification = partial(classification, config.api_root) -named_entities = partial(named_entities, config.api_root) diff --git a/indicoio/local/__init__.py b/indicoio/local/__init__.py index 63d55bd..0d54371 100644 --- a/indicoio/local/__init__.py +++ b/indicoio/local/__init__.py @@ -7,7 +7,6 @@ from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language from indicoio.text.classification import classification -from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -21,4 +20,3 @@ fer = partial(fer, config.local_api_root) facial_features = partial(facial_features, config.local_api_root) image_features = 
partial(image_features, config.local_api_root) classification = partial(classification, config.local_api_root) -named_entities = partial(named_entities, config.local_api_root) diff --git a/indicoio/text/ner.py b/indicoio/text/ner.py deleted file mode 100644 index 23b2792..0000000 --- a/indicoio/text/ner.py +++ /dev/null @@ -1,23 +0,0 @@ -from indicoio.utils import api_handler - -def named_entities(api_root, text): - """ - Given input text, returns a mapping from named entities to - named entity categories. - - Example usage: - - .. code-block:: python - - >>> import indicoio - >>> import numpy as np - >>> text = 'On Monday, president Barack Obama will be...' - >>> indicoio.named_entities(text) - >>> "{'Monday': 'Time', 'Barack Obama': 'Person'}" - - :param text: The text to be analyzed. - :type text: str or unicode - :rtype: Dictionary of named entity, category pairs - """ - - return api_handler(text, api_root + "ner") diff --git a/tests/local/test_local.py b/tests/local/test_local.py index 7d382d6..4f5748c 100644 --- a/tests/local/test_local.py +++ b/tests/local/test_local.py @@ -26,12 +26,6 @@ class FullAPIRun(unittest.TestCase): results = classification(text) self.assertTrue(categories < set(results.keys())) - def test_named_entity_recognition(self): - categories = set(['arts']) - text = "On Monday, president Barack Obama will be..." - results = named_entities(text) - self.assertTrue('named entity' in set(results.keys())) - def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people." 
diff --git a/tests/remote/test_remote.py b/tests/remote/test_remote.py index ed2ea06..8b6e9d3 100644 --- a/tests/remote/test_remote.py +++ b/tests/remote/test_remote.py @@ -4,8 +4,7 @@ import os import numpy as np import skimage.io -from indicoio import political, sentiment, fer, facial_features, language, image_features, \ - classification, named_entities +from indicoio import political, sentiment, fer, facial_features, language, image_features, classification DIR = os.path.dirname(os.path.realpath(__file__)) @@ -29,12 +28,6 @@ class FullAPIRun(unittest.TestCase): results = classification(text) self.assertTrue(categories < set(results.keys())) - def test_named_entity_recognition(self): - categories = set(['arts']) - text = "On Monday, president Barack Obama will be..." - results = named_entities(text) - self.assertTrue('named entity' in set(results.keys())) - def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people." 
From 1d66a039a2d0b17bcb70a6db08f0045e6da66dd4 Mon Sep 17 00:00:00 2001 From: Madison May Date: Fri, 7 Nov 2014 15:11:44 -0500 Subject: [PATCH 3/5] Update api names --- indicoio/__init__.py | 4 ++-- indicoio/local/__init__.py | 4 ++-- indicoio/text/{classification.py => tagging.py} | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename indicoio/text/{classification.py => tagging.py} (88%) diff --git a/indicoio/__init__.py b/indicoio/__init__.py index df5a31e..2309126 100644 --- a/indicoio/__init__.py +++ b/indicoio/__init__.py @@ -8,7 +8,7 @@ Version, version, __version__, VERSION = ('0.4.5',) * 4 from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language -from indicoio.text.classification import classification +from indicoio.text.tagging import text_tags from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -21,4 +21,4 @@ language = partial(language, config.api_root) fer = partial(fer, config.api_root) facial_features = partial(facial_features, config.api_root) image_features = partial(image_features, config.api_root) -classification = partial(classification, config.api_root) +text_tags = partial(text_tags, config.api_root) diff --git a/indicoio/local/__init__.py b/indicoio/local/__init__.py index 0d54371..428186e 100644 --- a/indicoio/local/__init__.py +++ b/indicoio/local/__init__.py @@ -6,7 +6,7 @@ JSON_HEADERS = {'Content-type': 'application/json', 'Accept': 'text/plain'} from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language -from indicoio.text.classification import classification +from indicoio.text.tagging import text_tags from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -19,4 +19,4 
@@ language = partial(language, config.local_api_root) fer = partial(fer, config.local_api_root) facial_features = partial(facial_features, config.local_api_root) image_features = partial(image_features, config.local_api_root) -classification = partial(classification, config.local_api_root) +text_tags = partial(text_tags, config.local_api_root) diff --git a/indicoio/text/classification.py b/indicoio/text/tagging.py similarity index 88% rename from indicoio/text/classification.py rename to indicoio/text/tagging.py index 3d7bfd0..3f182ad 100644 --- a/indicoio/text/classification.py +++ b/indicoio/text/tagging.py @@ -1,6 +1,6 @@ from indicoio.utils import api_handler -def classification(api_root, text): +def text_tags(api_root, text): """ Given input text, returns a probability distribution over 100 document categories @@ -22,4 +22,4 @@ def classification(api_root, text): :rtype: Dictionary of class probability pairs """ - return api_handler(text, api_root + "documentclassification") + return api_handler(text, api_root + "texttags") From 50a8ef6bf67d09f403e6d77b87484c58e68c48b6 Mon Sep 17 00:00:00 2001 From: Madison May Date: Fri, 7 Nov 2014 17:18:59 -0500 Subject: [PATCH 4/5] Updates to test suite for text tags --- tests/local/test_local.py | 22 +++++++++++++++++----- tests/remote/test_remote.py | 22 +++++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/local/test_local.py b/tests/local/test_local.py index 4f5748c..48cc0f5 100644 --- a/tests/local/test_local.py +++ b/tests/local/test_local.py @@ -2,7 +2,7 @@ import unittest import numpy as np -from indicoio.local import political, sentiment, fer, facial_features, language, image_features +from indicoio.local import political, sentiment, fer, facial_features, language, image_features, text_tags DIR = os.path.dirname(os.path.realpath(__file__)) @@ -20,11 +20,23 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) - 
def test_document_classification(self): - categories = set(['arts']) + def test_text_tags(self): + expected_keys = set(['fashion', 'art', 'energy', 'economics', 'entreprener', + 'books', 'politics', 'gardening', 'nba', 'conservative', + 'technology', 'startps', 'relationships', 'edcation', + 'hmor', 'psychology', 'bicycling', 'investing', 'travel', + 'cooking', 'christianity', 'environment', 'religion', 'health', + 'hockey', 'pets', 'msic', 'soccer', 'gns', 'gaming', 'jobs', + 'bsiness', 'natre', 'food', 'cars', 'photography', 'philosophy', + 'geek', 'sports', 'baseball', 'news', 'television', 'entertainment', + 'parenting', 'comics', 'science', 'nfl','programming', + 'personalfinance', 'atheism', 'movies', 'anime', 'fitness', + 'military', 'realestate', 'history']) text = "On Monday, president Barack Obama will be..." - results = classification(text) - self.assertTrue(categories < set(results.keys())) + results = text_tags(text) + max_keys = sorted(results.keys(), key=lambda x:results.get(x), reverse=True) + assert 'politics' in max_keys[:3] + self.assertTrue(expected_keys == set(results.keys())) def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) diff --git a/tests/remote/test_remote.py b/tests/remote/test_remote.py index 8b6e9d3..1477822 100644 --- a/tests/remote/test_remote.py +++ b/tests/remote/test_remote.py @@ -4,7 +4,7 @@ import os import numpy as np import skimage.io -from indicoio import political, sentiment, fer, facial_features, language, image_features, classification +from indicoio import political, sentiment, fer, facial_features, language, image_features, text_tags DIR = os.path.dirname(os.path.realpath(__file__)) @@ -22,11 +22,23 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) - def test_document_classification(self): - categories = set(['arts']) + def test_text_tags(self): + expected_keys = set(['fashion', 'art', 'energy', 
'economics', 'entrepreneur', + 'books', 'politics', 'gardening', 'nba', 'conservative', + 'technology', 'startups', 'relationships', 'education', + 'humor', 'psychology', 'bicycling', 'investing', 'travel', + 'cooking', 'christianity', 'environment', 'religion', 'health', + 'hockey', 'pets', 'music', 'soccer', 'guns', 'gaming', 'jobs', + 'business', 'nature', 'food', 'cars', 'photography', 'philosophy', + 'geek', 'sports', 'baseball', 'news', 'television', 'entertainment', + 'parenting', 'comics', 'science', 'nfl','programming', + 'personalfinance', 'atheism', 'movies', 'anime', 'fitness', + 'military', 'realestate', 'history']) text = "On Monday, president Barack Obama will be..." - results = classification(text) - self.assertTrue(categories < set(results.keys())) + results = text_tags(text) + max_keys = sorted(results.keys(), key=lambda x:results.get(x), reverse=True) + assert 'politics' in max_keys[:3] + self.assertFalse(set(results.keys()) - expected_keys) def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) From 92da96e6c8bd6cedbba9b9ae056948a9adc72183 Mon Sep 17 00:00:00 2001 From: Madison May Date: Fri, 7 Nov 2014 17:22:09 -0500 Subject: [PATCH 5/5] Updates to README --- CHANGES.txt | 1 + README | 3 +++ README.md | 3 +++ setup.py | 2 +- 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index c3f8f44..0e02727 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -12,3 +12,4 @@ v0.4.3, Thu Sep 11 -- Added image features api and sphinx compliant documentatio v0.4.4, Thu Sep 25 -- Added dependencies installation to setup.py v0.4.5, Thu Sep 25 -- Added interface to local indico server v0.4.6, Fri Oct 27 -- Updated to point to new indico api servers, cleaner REST API +v0.4.8, Fri Nov 7 -- Updated API interface to include new text tags API diff --git a/README b/README index 95cec92..4b76b3a 100644 --- a/README +++ b/README @@ -41,6 +41,9 @@ Examples >>> sentiment('Really enjoyed the movie.') 
{u'Sentiment': 0.8105182526856075} +>>> text_tags("On Monday, president Barack Obama will be...") +{u'fashion': 0.024739582352183764, u'art': 0.008637280256320275, u'energy': 0.013183388999943419, ...} + >>> test_face = np.linspace(0,50,48*48).reshape(48,48).tolist() >>> fer(test_face) diff --git a/README.md b/README.md index 272b0bd..6a16e80 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,9 @@ Examples >>> sentiment('Really enjoyed the movie.') {u'Sentiment': 0.8105182526856075} +>>> text_tags("On Monday, president Barack Obama will be...") +{u'fashion': 0.024739582352183764, u'art': 0.008637280256320275, u'energy': 0.013183388999943419, ...} + >>> test_face = np.linspace(0,50,48*48).reshape(48,48).tolist() >>> fer(test_face) diff --git a/setup.py b/setup.py index d61d160..cc6e82f 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ except ImportError: setup( name="IndicoIo", - version='0.4.7', + version='0.4.8', packages=[ "indicoio", "indicoio.text",