From f046e0d7c82e0dc6f4e8e6d7857c51db13bbd805 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Oct 2016 14:20:23 -0400 Subject: [PATCH 1/4] add parse_tree method to language, separate from __call__ for efficiency, but will use __call__ to get the doc --- spacy/language.py | 56 +++++++++++++++++++++++++++ spacy/tests/tokens/test_tokens_api.py | 7 ++++ 2 files changed, 63 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index df7728d08..b9edf1379 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -261,6 +261,62 @@ class Language(object): proc(doc) return doc + def merge_ents(self, doc): + '''Helper: merge adjacent entities into single tokens; modifies the doc.''' + for ent in doc.ents: + ent.merge(ent.root.tag_, ent.text, ent.label_) + return doc + + def format_POS(self, token, light=False, flat=False): + '''helper: form the POS output for a token''' + subtree = dict([ + ("word", token.text), + ("lemma", token.lemma_), # trigger + ("NE", token.ent_type_), # trigger + ("POS_fine", token.tag_), + ("POS_coarse", token.pos_), + ("arc", token.dep_), + ("modifiers", []) + ]) + if light: + subtree.pop("lemma") + subtree.pop("NE") + if flat: + subtree.pop("arc") + subtree.pop("modifiers") + return subtree + + def POS_tree_(self, root, light=False): + '''Helper: generate a POS tree for a root token. + The doc must have merge_ents(doc) ran on it. + ''' + subtree = self.format_POS(root, light=light) + for c in root.children: + subtree["modifiers"].append(self.POS_tree_(c)) + return subtree + + def parse_tree_(self, doc, light=False): + '''generate the POS tree for all sentences in a doc''' + self.merge_ents(doc) # merge the entities into single tokens first + return [self.POS_tree_(sent.root, light=light) for sent in doc.sents] + + def parse_tree(self, text, tag=True, parse=True, entity=True, light=False): + """Apply self.__call__ and use the resulting doc to construct a syntactic parse tree, similar to the one used in displaCy. 
+ + Args: + text (unicode): The text to be processed. + + Returns: + [parse_trees (OrderedDicts)]: + + >>> from spacy.en import English + >>> nlp = English() + >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.') + [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] + """ + doc = self.__call__(text, tag=tag, parse=parse, entity=entity) + return self.parse_tree_(doc, light=light) + def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000): skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity} diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py index 47ad8545f..74522d574 100644 --- a/spacy/tests/tokens/test_tokens_api.py +++ 
b/spacy/tests/tokens/test_tokens_api.py @@ -206,3 +206,10 @@ def test_right_edge(EN): def test_has_vector(EN): doc = EN(u'''apple orange pear''') assert doc.has_vector + + +def test_parse_tree(EN): + trees = EN.parse_tree(u'''Bob brought Alice the pizza.''') + assert len(trees) > 0 + tree = trees[0] + assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers']) From 17b7832419eaa66599deb050a69543916993af61 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Oct 2016 14:39:07 -0400 Subject: [PATCH 2/4] mark test as needing models --- spacy/tests/tokens/test_tokens_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py index 74522d574..00ace4c42 100644 --- a/spacy/tests/tokens/test_tokens_api.py +++ b/spacy/tests/tokens/test_tokens_api.py @@ -208,6 +208,7 @@ def test_has_vector(EN): assert doc.has_vector +@pytest.mark.models def test_parse_tree(EN): trees = EN.parse_tree(u'''Bob brought Alice the pizza.''') assert len(trees) > 0 From fb92e2d06184ffe52c2b7fb3122f31f6acc1a448 Mon Sep 17 00:00:00 2001 From: kengz Date: Sun, 16 Oct 2016 15:12:08 -0400 Subject: [PATCH 3/4] activate parse_tree test, use from_array, test for root correctness --- spacy/tests/tokens/test_tokens_api.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py index 00ace4c42..1b46ecb67 100644 --- a/spacy/tests/tokens/test_tokens_api.py +++ b/spacy/tests/tokens/test_tokens_api.py @@ -208,9 +208,14 @@ def test_has_vector(EN): assert doc.has_vector -@pytest.mark.models def test_parse_tree(EN): - trees = EN.parse_tree(u'''Bob brought Alice the pizza.''') + text = 'I like New York in Autumn.' 
+ EN = English(parser=False) + doc = EN(text, tag=True) + doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) + # full method parse_tree(text) is a trivial composition + trees = EN.parse_tree_(doc) assert len(trees) > 0 tree = trees[0] assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers']) + assert tree['word'] == 'like' # check root is correct From da44183ae13e26dc74abe80ff487817cfdf6a2bd Mon Sep 17 00:00:00 2001 From: kengz Date: Fri, 30 Dec 2016 12:19:18 -0500 Subject: [PATCH 4/4] move parse_tree logic to a new tokens/printers.py file --- spacy/language.py | 56 --------------------------- spacy/tests/tokens/test_tokens_api.py | 2 +- spacy/tokens/doc.pyx | 5 +++ spacy/tokens/printers.py | 54 ++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 57 deletions(-) create mode 100644 spacy/tokens/printers.py diff --git a/spacy/language.py b/spacy/language.py index b9edf1379..df7728d08 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -261,62 +261,6 @@ class Language(object): proc(doc) return doc - def merge_ents(self, doc): - '''Helper: merge adjacent entities into single tokens; modifies the doc.''' - for ent in doc.ents: - ent.merge(ent.root.tag_, ent.text, ent.label_) - return doc - - def format_POS(self, token, light=False, flat=False): - '''helper: form the POS output for a token''' - subtree = dict([ - ("word", token.text), - ("lemma", token.lemma_), # trigger - ("NE", token.ent_type_), # trigger - ("POS_fine", token.tag_), - ("POS_coarse", token.pos_), - ("arc", token.dep_), - ("modifiers", []) - ]) - if light: - subtree.pop("lemma") - subtree.pop("NE") - if flat: - subtree.pop("arc") - subtree.pop("modifiers") - return subtree - - def POS_tree_(self, root, light=False): - '''Helper: generate a POS tree for a root token. - The doc must have merge_ents(doc) ran on it. 
- ''' - subtree = self.format_POS(root, light=light) - for c in root.children: - subtree["modifiers"].append(self.POS_tree_(c)) - return subtree - - def parse_tree_(self, doc, light=False): - '''generate the POS tree for all sentences in a doc''' - self.merge_ents(doc) # merge the entities into single tokens first - return [self.POS_tree_(sent.root, light=light) for sent in doc.sents] - - def parse_tree(self, text, tag=True, parse=True, entity=True, light=False): - """Apply self.__call__ and use the resulting doc to construct a syntactic parse tree, similar to the one used in displaCy. - - Args: - text (unicode): The text to be processed. - - Returns: - [parse_trees (OrderedDicts)]: - - >>> from spacy.en import English - >>> nlp = English() - >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.') - [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 
'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] - """ - doc = self.__call__(text, tag=tag, parse=parse, entity=entity) - return self.parse_tree_(doc, light=light) - def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000): skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity} diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py index 1b46ecb67..d229ae7b1 100644 --- a/spacy/tests/tokens/test_tokens_api.py +++ b/spacy/tests/tokens/test_tokens_api.py @@ -214,7 +214,7 @@ def test_parse_tree(EN): doc = EN(text, tag=True) doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T) # full method parse_tree(text) is a trivial composition - trees = EN.parse_tree_(doc) + trees = doc.print_tree() assert len(trees) > 0 tree = trees[0] assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers']) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 66654482e..758b0290f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -21,6 +21,7 @@ from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme from .span cimport Span from .token cimport Token +from .printers import parse_tree from ..serialize.bits cimport BitArray from ..util import normalize_slice from ..syntax.iterators import CHUNKERS @@ -649,6 +650,10 @@ cdef class Doc: # Return the merged Python object return self[start] + def print_tree(self, light=False, flat=False): + """Returns the parse trees in the JSON (Dict) format.""" + return parse_tree(self, light=light, flat=flat) + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py new file mode 100644 index 000000000..d70088540 --- /dev/null +++ b/spacy/tokens/printers.py @@ -0,0 
+1,54 @@
+from copy import deepcopy
+
+def merge_ents(doc):
+    '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
+    for ent in doc.ents:
+        ent.merge(ent.root.tag_, ent.text, ent.label_)
+    return doc
+
+def format_POS(token, light, flat):
+    '''Helper: form the POS output dict for a token; light drops lemma/NE, flat drops arc/modifiers.'''
+    subtree = dict([
+        ("word", token.text),
+        ("lemma", token.lemma_),  # trigger
+        ("NE", token.ent_type_),  # trigger
+        ("POS_fine", token.tag_),
+        ("POS_coarse", token.pos_),
+        ("arc", token.dep_),
+        ("modifiers", [])
+    ])
+    if light:
+        subtree.pop("lemma")
+        subtree.pop("NE")
+    if flat:
+        subtree.pop("arc")
+        subtree.pop("modifiers")
+    return subtree
+
+def POS_tree(root, light, flat):
+    '''Helper: generate a POS tree for a root token, passing light/flat down to every child.
+    The doc must have merge_ents(doc) run on it.
+    '''
+    subtree = format_POS(root, light=light, flat=flat)
+    if not flat:  # a flat record has no "modifiers" key, so there is nowhere to attach children
+        subtree["modifiers"] = [POS_tree(c, light=light, flat=flat) for c in root.children]
+    return subtree
+
+def parse_tree(doc, light=False, flat=False):
+    """Make a copy of the doc, then construct a syntactic parse tree, similar to the one used in displaCy. Generates the POS tree for all sentences in the doc; with flat=True each entry is only the sentence root's record.
+
+    Args:
+        doc: The doc for parsing (entities are merged on the copy, not on this doc).
+
+    Returns:
+        [parse_trees (Dict)]:
+
+    >>> from spacy.en import English
+    >>> nlp = English()
+    >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
+    >>> trees = doc.print_tree()
+    [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
+    """
+    doc_clone = deepcopy(doc)
+    merge_ents(doc_clone)  # merge the entities into single tokens first
+    return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]