From ea29884396eaa9b38bd0ca633873d24084742fe2 Mon Sep 17 00:00:00 2001 From: Madison May Date: Mon, 3 Nov 2014 15:42:23 -0500 Subject: [PATCH] NER + document classification --- indicoio/__init__.py | 4 ++++ indicoio/local/__init__.py | 4 ++++ indicoio/text/classification.py | 25 +++++++++++++++++++++++++ indicoio/text/lang.py | 3 --- indicoio/text/ner.py | 23 +++++++++++++++++++++++ indicoio/text/sentiment.py | 3 --- tests/local/test_local.py | 12 ++++++++++++ tests/remote/test_remote.py | 15 ++++++++++++++- 8 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 indicoio/text/classification.py create mode 100644 indicoio/text/ner.py diff --git a/indicoio/__init__.py b/indicoio/__init__.py index d92b8ba..d9965a4 100644 --- a/indicoio/__init__.py +++ b/indicoio/__init__.py @@ -8,6 +8,8 @@ Version, version, __version__, VERSION = ('0.4.5',) * 4 from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import language +from indicoio.text.classification import classification +from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -20,3 +22,5 @@ language = partial(language, config.api_root) fer = partial(fer, config.api_root) facial_features = partial(facial_features, config.api_root) image_features = partial(image_features, config.api_root) +classification = partial(classification, config.api_root) +named_entities = partial(named_entities, config.api_root) diff --git a/indicoio/local/__init__.py b/indicoio/local/__init__.py index 9d44439..63d55bd 100644 --- a/indicoio/local/__init__.py +++ b/indicoio/local/__init__.py @@ -6,6 +6,8 @@ JSON_HEADERS = {'Content-type': 'application/json', 'Accept': 'text/plain'} from indicoio.text.sentiment import political, posneg from indicoio.text.sentiment import posneg as sentiment from indicoio.text.lang import 
language +from indicoio.text.classification import classification +from indicoio.text.ner import named_entities from indicoio.images.fer import fer from indicoio.images.features import facial_features from indicoio.images.features import image_features @@ -18,3 +20,5 @@ language = partial(language, config.local_api_root) fer = partial(fer, config.local_api_root) facial_features = partial(facial_features, config.local_api_root) image_features = partial(image_features, config.local_api_root) +classification = partial(classification, config.local_api_root) +named_entities = partial(named_entities, config.local_api_root) diff --git a/indicoio/text/classification.py b/indicoio/text/classification.py new file mode 100644 index 0000000..3d7bfd0 --- /dev/null +++ b/indicoio/text/classification.py @@ -0,0 +1,25 @@ +from indicoio.utils import api_handler + +def classification(api_root, text): + """ + Given input text, returns a probability distribution over 100 document categories + + Example usage: + + .. code-block:: python + + >>> import indicoio + >>> import numpy as np + >>> text = 'Monday: Delightful with mostly sunny skies. Highs in the low 70s.' + >>> possible = indicoio.classification(text) + >>> category = possible.keys()[np.argmax(possible.values())] + >>> probability = np.max(possible.values()) + >>> "Predicted category '%s' with probability %.4f"%(category,probability) + u"Predicted category 'Weather' with probability 0.8548" + + :param text: The text to be analyzed. 
+ :type text: str or unicode + :rtype: Dictionary of class probability pairs + """ + + return api_handler(text, api_root + "documentclassification") diff --git a/indicoio/text/lang.py b/indicoio/text/lang.py index 169b415..068d45d 100644 --- a/indicoio/text/lang.py +++ b/indicoio/text/lang.py @@ -1,6 +1,3 @@ -import requests -import json - from indicoio.utils import api_handler def language(api_root, text): diff --git a/indicoio/text/ner.py b/indicoio/text/ner.py new file mode 100644 index 0000000..23b2792 --- /dev/null +++ b/indicoio/text/ner.py @@ -0,0 +1,23 @@ +from indicoio.utils import api_handler + +def named_entities(api_root, text): + """ + Given input text, returns a mapping from named entities to + named entity categories. + + Example usage: + + .. code-block:: python + + >>> import indicoio + >>> import numpy as np + >>> text = 'On Monday, president Barack Obama will be...' + >>> indicoio.named_entities(text) + {'Monday': 'Time', 'Barack Obama': 'Person'} + + :param text: The text to be analyzed. + :type text: str or unicode + :rtype: Dictionary of named entity, category pairs + """ + + return api_handler(text, api_root + "ner") diff --git a/indicoio/text/sentiment.py b/indicoio/text/sentiment.py index e60cd42..81f0363 100644 --- a/indicoio/text/sentiment.py +++ b/indicoio/text/sentiment.py @@ -1,6 +1,3 @@ -import requests -import json - from indicoio import JSON_HEADERS from indicoio.utils import api_handler diff --git a/tests/local/test_local.py b/tests/local/test_local.py index f8bc5dd..7d382d6 100644 --- a/tests/local/test_local.py +++ b/tests/local/test_local.py @@ -20,6 +20,18 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) + def test_document_classification(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." 
+ results = classification(text) + self.assertTrue(categories < set(results.keys())) + + def test_named_entity_recognition(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = named_entities(text) + self.assertTrue('named entity' in set(results.keys())) + def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people." diff --git a/tests/remote/test_remote.py b/tests/remote/test_remote.py index 9c75c3b..ed2ea06 100644 --- a/tests/remote/test_remote.py +++ b/tests/remote/test_remote.py @@ -4,7 +4,8 @@ import os import numpy as np import skimage.io -from indicoio import political, sentiment, fer, facial_features, language, image_features +from indicoio import political, sentiment, fer, facial_features, language, image_features, \ + classification, named_entities DIR = os.path.dirname(os.path.realpath(__file__)) @@ -22,6 +23,18 @@ class FullAPIRun(unittest.TestCase): self.assertTrue(vector.min() < minimum) self.assertTrue(np.ptp(vector) > span) + def test_document_classification(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = classification(text) + self.assertTrue(categories < set(results.keys())) + + def test_named_entity_recognition(self): + categories = set(['arts']) + text = "On Monday, president Barack Obama will be..." + results = named_entities(text) + self.assertTrue('named entity' in set(results.keys())) + def test_political(self): political_set = set(['Libertarian', 'Liberal', 'Conservative', 'Green']) test_string = "Guns don't kill people, people kill people."