Merge Kengz's tree_print patch

2025-11-13 14:25:52 +03:00 · 2017-05-13 03:18:49 +02:00 · 2017-05-13 03:18:49 +02:00 · b2540d2379
commit b2540d2379
parent 76ebd0fe5c 73a38bd4d1
3 changed files with 74 additions and 0 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -215,3 +215,16 @@ def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):

    doc = en_tokenizer(text)
    assert doc.has_vector
+
+
+def test_parse_tree(EN):
+    """print_tree() on a doc with hand-set heads returns a complete tree rooted at 'like'."""
+    nlp = English(parser=False)  # fresh pipeline; the value of the EN argument is not used
+    doc = nlp('I like New York in Autumn.', tag=True)
+    heads = numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T  # presumably relative head offsets, one per token
+    doc.from_array([HEAD], heads)
+    trees = doc.print_tree()
+    assert len(trees) > 0
+    first = trees[0]
+    assert all(key in first for key in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
+    assert first['word'] == 'like'  # check root is correct
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -22,6 +22,9 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
+from .span cimport Span
+from .token cimport Token
+from .printers import parse_tree
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
@ -774,6 +777,10 @@ cdef class Doc:
        # Return the merged Python object
        return self[start]

+    def print_tree(self, light=False, flat=False):
+        """Returns the parse trees as JSON-style dicts; delegates to `parse_tree`, forwarding `light` and `flat`."""
+        return parse_tree(self, light=light, flat=flat)
+

 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    cdef int i
--- a/spacy/tokens/printers.py
+++ b/spacy/tokens/printers.py
@ -0,0 +1,54 @@
+from copy import deepcopy
+
+def merge_ents(doc):
+    '''Helper: merge every entity span in doc.ents into a single token; mutates and returns doc.'''
+    for span in doc.ents:
+        span.merge(span.root.tag_, span.text, span.label_)
+    return doc
+
+def format_POS(token, light, flat):
+    '''Helper: describe one token as a dict.
+
+    Keys are "word", "lemma", "NE", "POS_fine", "POS_coarse", "arc" and "modifiers" (a new
+    empty list); light drops "lemma" and "NE", flat drops "arc" and "modifiers".
+    '''
+    dropped = (("lemma", "NE") if light else ()) + (("arc", "modifiers") if flat else ())
+    fields = [  # every attribute is read here, even for keys that end up dropped
+        ("word", token.text),
+        ("lemma", token.lemma_),
+        ("NE", token.ent_type_),
+        ("POS_fine", token.tag_),
+        ("POS_coarse", token.pos_),
+        ("arc", token.dep_),
+        ("modifiers", []),
+    ]
+    subtree = dict((key, value) for key, value in fields if key not in dropped)
+    return subtree
+
+def POS_tree(root, light, flat):
+    '''Helper: build the tree dict for root, nesting each child's tree under "modifiers".
+    merge_ents(doc) must have been run first. NOTE(review): flat yields root-only dicts - confirm.
+    '''
+    subtree = format_POS(root, light=light, flat=flat)
+    if not flat:  # format_POS removes "modifiers" when flat, so there is nowhere to nest children
+        subtree["modifiers"] = [POS_tree(c, light=light, flat=flat) for c in root.children]
+    return subtree
+
+def parse_tree(doc, light=False, flat=False):
+    """Deep-copy the doc, merge its entities, and return one displaCy-style parse tree (dict) per sentence.
+
+    Args:
+        doc: The doc for parsing; it is copied, so the caller's doc is not modified.
+        light: Forwarded to POS_tree; see format_POS for the keys it removes.
+        flat: Forwarded to POS_tree; see format_POS for the keys it removes.
+
+    Returns:
+        [parse_trees (Dict)]: one tree per sentence in doc.sents, built from that sentence's root.
+
+    >>> from spacy.en import English
+    >>> doc = English()('Bob brought Alice the pizza. Alice ate the pizza.')
+    >>> doc.print_tree()
+    [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
+    """
+    merged = merge_ents(deepcopy(doc))  # merge_ents returns the doc it was given, here the copy
+    return [POS_tree(sentence.root, light=light, flat=flat) for sentence in merged.sents]