move parse_tree logic to a new tokens/printers.py file

2025-11-11 13:25:43 +03:00 · 2016-12-30 12:19:18 -05:00 · 2016-12-30 12:19:18 -05:00 · da44183ae1
commit da44183ae1
parent fb92e2d061
4 changed files with 60 additions and 57 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -261,62 +261,6 @@ class Language(object):
                proc(doc)
        return doc
    def merge_ents(self, doc):
        '''Helper: collapse each entity span of the doc into a single token.

        Every span in ``doc.ents`` is merged through ``span.merge``, which is
        called with the tag of the span's root token, the span text and the
        entity label, in that order.

        Args:
            doc: The document whose entity spans are merged (it is modified).
        Returns:
            The same ``doc`` object that was passed in.
        '''
        for span in doc.ents:
            root_tag = span.root.tag_
            span_text = span.text
            span.merge(root_tag, span_text, span.label_)
        return doc
    def format_POS(self, token, light=False, flat=False):
        '''Helper: describe one token as a dict of its annotations.

        Args:
            token: Object exposing ``text``, ``lemma_``, ``ent_type_``,
                ``tag_``, ``pos_`` and ``dep_``.
            light: When true, the "lemma" and "NE" entries are left out.
            flat: When true, the "arc" and "modifiers" entries are left out.
        Returns:
            A dict with the keys "word", "lemma", "NE", "POS_fine",
            "POS_coarse", "arc" and "modifiers" (a fresh empty list), minus
            whatever the flags removed.
        '''
        # All attributes are read before any key is dropped, whatever the flags.
        subtree = {
            "word": token.text,
            "lemma": token.lemma_,  # trigger
            "NE": token.ent_type_,  # trigger
            "POS_fine": token.tag_,
            "POS_coarse": token.pos_,
            "arc": token.dep_,
            "modifiers": [],
        }
        unwanted = []
        if light:
            unwanted += ["lemma", "NE"]
        if flat:
            unwanted += ["arc", "modifiers"]
        for key in unwanted:
            del subtree[key]
        return subtree
    def POS_tree_(self, root, light=False):
        '''Helper: generate a POS tree for a root token.
        The doc must have had merge_ents(doc) run on it.

        Args:
            root: Token to start from; its ``children`` are visited
                recursively.
            light: Forwarded to ``format_POS`` for the root and for every
                descendant, so the whole tree uses one format.
        Returns:
            The dict produced by ``format_POS`` for ``root``, whose
            "modifiers" list holds the subtree of each child, in iteration
            order.
        '''
        subtree = self.format_POS(root, light=light)
        for child in root.children:
            # Pass ``light`` down explicitly: without it the recursion falls
            # back to the default and nested nodes keep "lemma"/"NE" even
            # though the caller asked for the light format.
            subtree["modifiers"].append(self.POS_tree_(child, light=light))
        return subtree
    def parse_tree_(self, doc, light=False):
        '''Generate the POS tree for every sentence in a doc.

        The doc is first passed to ``merge_ents`` (which modifies it), then
        one tree is built per entry of ``doc.sents`` from that sentence's
        root.

        Args:
            doc: The document to convert.
            light: Forwarded to ``POS_tree_`` for each sentence root.
        Returns:
            A list with one ``POS_tree_`` result per sentence, in order.
        '''
        # Entities have to be merged into single tokens before walking the tree.
        self.merge_ents(doc)
        trees = []
        for sentence in doc.sents:
            trees.append(self.POS_tree_(sentence.root, light=light))
        return trees
    def parse_tree(self, text, tag=True, parse=True, entity=True, light=False):
        """Process a text and return its syntactic parse trees.

        The text is run through ``self.__call__`` and the resulting doc is
        handed to ``parse_tree_``; the output is similar to the trees used in
        displaCy.

        Args:
            text (unicode): The text to be processed.
            tag: Forwarded to ``self.__call__`` as ``tag``.
            parse: Forwarded to ``self.__call__`` as ``parse``.
            entity: Forwarded to ``self.__call__`` as ``entity``.
            light: Forwarded to ``parse_tree_``.
        Returns:
            The value returned by ``parse_tree_``: a list with one tree per
            sentence of the processed doc.

        Example:
            >>> from spacy.en import English
            >>> nlp = English()
            >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.')
        """
        pipeline_switches = {"tag": tag, "parse": parse, "entity": entity}
        processed = self.__call__(text, **pipeline_switches)
        return self.parse_tree_(processed, light=light)
    def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2,
            batch_size=1000):
        skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@ -214,7 +214,7 @@ def test_parse_tree(EN):
    doc = EN(text, tag=True)
    doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
    # full method parse_tree(text) is a trivial composition
-    trees = EN.parse_tree_(doc)
+    trees = doc.print_tree()
    assert len(trees) > 0
    tree = trees[0]
    assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -21,6 +21,7 @@ from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
 from .token cimport Token
 from .printers import parse_tree
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
@ -649,6 +650,10 @@ cdef class Doc:
        # Return the merged Python object
        return self[start]
    def print_tree(self, light=False, flat=False):
        """Return this document's parse trees in the JSON (dict) format.

        Thin wrapper: hands the document and both flags to ``parse_tree``
        (imported from ``.printers``) and returns its result unchanged.

        Args:
            light: Forwarded to ``parse_tree`` as ``light``.
            flat: Forwarded to ``parse_tree`` as ``flat``.
        """
        trees = parse_tree(self, light=light, flat=flat)
        return trees
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
    cdef int i
--- a/spacy/tokens/printers.py
+++ b/spacy/tokens/printers.py
@ -0,0 +1,54 @@
 from copy import deepcopy
 def merge_ents(doc):
    '''Helper: collapse each entity span of the doc into a single token.

    Every span in ``doc.ents`` is merged through ``span.merge``, which is
    called with the tag of the span's root token, the span text and the
    entity label, in that order.

    Args:
        doc: The document whose entity spans are merged (it is modified).
    Returns:
        The same ``doc`` object that was passed in.
    '''
    for span in doc.ents:
        root_tag = span.root.tag_
        span_text = span.text
        span.merge(root_tag, span_text, span.label_)
    return doc
 def format_POS(token, light, flat):
    '''Helper: describe one token as a dict of its annotations.

    Args:
        token: Object exposing ``text``, ``lemma_``, ``ent_type_``,
            ``tag_``, ``pos_`` and ``dep_``.
        light: When true, the "lemma" and "NE" entries are left out.
        flat: When true, the "arc" and "modifiers" entries are left out.
    Returns:
        A dict with the keys "word", "lemma", "NE", "POS_fine",
        "POS_coarse", "arc" and "modifiers" (a fresh empty list), minus
        whatever the flags removed.
    '''
    # All attributes are read before any key is dropped, whatever the flags.
    subtree = {
        "word": token.text,
        "lemma": token.lemma_,  # trigger
        "NE": token.ent_type_,  # trigger
        "POS_fine": token.tag_,
        "POS_coarse": token.pos_,
        "arc": token.dep_,
        "modifiers": [],
    }
    unwanted = []
    if light:
        unwanted += ["lemma", "NE"]
    if flat:
        unwanted += ["arc", "modifiers"]
    for key in unwanted:
        del subtree[key]
    return subtree
 def POS_tree(root, light, flat):
    '''Helper: generate a POS tree for a root token.
    The doc must have had merge_ents(doc) run on it.

    Args:
        root: Token to start from; its ``children`` are visited recursively.
        light: Forwarded to ``format_POS`` for the root and every descendant.
        flat: Forwarded to ``format_POS``. In flat mode the formatted node
            has no "modifiers" list, so only the root's own entry is
            returned and its children are not visited.
    Returns:
        The dict produced by ``format_POS`` for ``root``; unless ``flat`` is
        set, its "modifiers" list holds the subtree of each child, in
        iteration order.
    '''
    subtree = format_POS(root, light=light, flat=flat)
    if flat:
        # format_POS removed "modifiers", so there is nowhere to attach
        # children; indexing it would raise KeyError.
        return subtree
    for child in root.children:
        # ``light`` and ``flat`` are required parameters: the recursive call
        # has to pass them or it fails with TypeError on the first child.
        subtree["modifiers"].append(POS_tree(child, light=light, flat=flat))
    return subtree
 def parse_tree(doc, light=False, flat=False):
    """Build a syntactic parse tree for every sentence of a doc.

    Works on a deep copy of ``doc``: entity merging is applied to the copy,
    not to the object the caller passed in. The output is similar to the
    trees used in displaCy.

    Args:
        doc: The doc for parsing.
        light: Forwarded to ``POS_tree`` for each sentence root.
        flat: Forwarded to ``POS_tree`` for each sentence root.
    Returns:
        A list with one ``POS_tree`` result (a dict) per sentence of the
        copied doc, in order.

    Example:
        >>> from spacy.en import English
        >>> nlp = English()
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
    """
    doc_clone = deepcopy(doc)
    # Entities have to be merged into single tokens before walking the tree.
    merge_ents(doc_clone)
    trees = []
    for sentence in doc_clone.sents:
        trees.append(POS_tree(sentence.root, light=light, flat=flat))
    return trees