spaCy/spacy/tokens/printers.py

# coding: utf8
from __future__ import unicode_literals

from .doc import Doc
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE


def merge_ents(doc):
    """
    Helper: merge every entity in doc.ents into a single token.

    Each entity is merged with its root token's fine-grained tag, its own
    text and its own label. The doc is modified in place and is also
    returned.
    """
    for entity in doc.ents:
        tag, text, label = entity.root.tag_, entity.text, entity.label_
        entity.merge(tag, text, label)
    return doc


def format_POS(token, light, flat):
    """
    Helper: build the dict that describes a single token in the POS output.

    The full dict has the keys "word", "lemma", "NE", "POS_fine",
    "POS_coarse", "arc" and "modifiers" (a new empty list).

    Args:
        token: The token to describe.
        light: If truthy, leave out "lemma" and "NE".
        flat: If truthy, leave out "arc" and "modifiers".

    Returns:
        dict: The description of the token.
    """
    # NOTE(review): the original marks the lemma_ and ent_type_ reads with
    # "# trigger"; presumably the attribute access itself matters -- confirm
    # before making these reads conditional.
    node = {
        "word": token.text,
        "lemma": token.lemma_,
        "NE": token.ent_type_,
        "POS_fine": token.tag_,
        "POS_coarse": token.pos_,
        "arc": token.dep_,
        "modifiers": [],
    }
    dropped = []
    if light:
        dropped += ["lemma", "NE"]
    if flat:
        dropped += ["arc", "modifiers"]
    for key in dropped:
        del node[key]
    return node


def POS_tree(root, light=False, flat=False):
    """
    Helper: generate a POS tree for a root token. The doc must have
    merge_ents(doc) ran on it.

    Args:
        root: The token to describe; its children are visited recursively.
        light: Passed to format_POS for the root and for every descendant.
        flat: Passed to format_POS for the root and for every descendant.
            When format_POS returns a dict without a "modifiers" list (as
            it does for flat output), no children are attached.

    Returns:
        dict: The format_POS dict for root, with the trees of its children
        appended to "modifiers" when that key is present.
    """
    subtree = format_POS(root, light=light, flat=flat)
    # format_POS removes "modifiers" for flat output; indexing the key
    # unconditionally raised KeyError for any root that has children.
    modifiers = subtree.get("modifiers")
    if modifiers is None:
        return subtree
    for child in root.children:
        # Forward the flags so descendants are formatted like the root
        # instead of silently falling back to the defaults.
        modifiers.append(POS_tree(child, light=light, flat=flat))
    return subtree


def parse_tree(doc, light=False, flat=False):
    """
    Build a syntactic parse tree, similar to the one used in displaCy, for
    every sentence of a doc.

    The trees are generated from a copy of the doc (same vocab and token
    texts, with the HEAD, TAG, DEP, ENT_IOB and ENT_TYPE arrays carried
    over), so the entity merging is applied to the copy rather than to the
    doc that was passed in.

    Args:
        doc: The doc for parsing.
        light: Passed on to POS_tree for each sentence root.
        flat: Passed on to POS_tree for each sentence root.

    Returns:
        [parse_trees (Dict)]: One POS tree per sentence, in sentence order.

    >>> from spacy.en import English
    >>> nlp = English()
    >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
    >>> trees = doc.print_tree()
    >>> [tree['word'] for tree in trees]
    ['brought', 'ate']
    >>> [child['word'] for child in trees[1]['modifiers']]
    ['Alice', 'pizza', '.']
    """
    copied_attrs = [HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]
    doc_clone = Doc(doc.vocab, words=[token.text for token in doc])
    doc_clone.from_array(copied_attrs, doc.to_array(copied_attrs))
    # Entities have to be single tokens before the trees are built.
    merge_ents(doc_clone)
    trees = []
    for sentence in doc_clone.sents:
        trees.append(POS_tree(sentence.root, light=light, flat=flat))
    return trees
Fix formatting 2017-05-13 13:32:06 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00
Replace deepcopy 2017-05-13 13:32:37 +03:00			`from .doc import Doc`
			`from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE`


move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`def merge_ents(doc):`
Fix formatting 2017-05-13 13:32:06 +03:00			`"""`
			`Helper: merge adjacent entities into single tokens; modifies the doc.`
			`"""`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`for ent in doc.ents:`
			`ent.merge(ent.root.tag_, ent.text, ent.label_)`
			`return doc`

Fix formatting 2017-05-13 13:32:06 +03:00
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`def format_POS(token, light, flat):`
Fix formatting 2017-05-13 13:32:06 +03:00			`"""`
			`Helper: form the POS output for a token.`
			`"""`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`subtree = dict([`
			`("word", token.text),`
			`("lemma", token.lemma_), # trigger`
			`("NE", token.ent_type_), # trigger`
			`("POS_fine", token.tag_),`
			`("POS_coarse", token.pos_),`
			`("arc", token.dep_),`
			`("modifiers", [])`
			`])`
			`if light:`
			`subtree.pop("lemma")`
			`subtree.pop("NE")`
			`if flat:`
			`subtree.pop("arc")`
			`subtree.pop("modifiers")`
			`return subtree`

Fix formatting 2017-05-13 13:32:06 +03:00
Set defaults for light and flat kwargs 2017-05-13 13:32:23 +03:00			`def POS_tree(root, light=False, flat=False):`
Fix formatting 2017-05-13 13:32:06 +03:00			`"""`
			`Helper: generate a POS tree for a root token. The doc must have`
			`merge_ents(doc) ran on it.`
			`"""`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`subtree = format_POS(root, light=light, flat=flat)`
			`for c in root.children:`
			`subtree["modifiers"].append(POS_tree(c))`
			`return subtree`

Fix formatting 2017-05-13 13:32:06 +03:00
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`def parse_tree(doc, light=False, flat=False):`
Fix formatting 2017-05-13 13:32:06 +03:00			`"""`
			`Makes a copy of the doc, then construct a syntactic parse tree, similar to`
			`the one used in displaCy. Generates the POS tree for all sentences in a doc.`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00
			`Args:`
			`doc: The doc for parsing.`

			`Returns:`
			`[parse_trees (Dict)]:`

			`>>> from spacy.en import English`
			`>>> nlp = English()`
			`>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')`
			`>>> trees = doc.print_tree()`
			[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
			`"""`
Replace deepcopy 2017-05-13 13:32:37 +03:00			`doc_clone = Doc(doc.vocab, words=[w.text for w in doc])`
			`doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],`
			`doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))`
move parse_tree logic to a new tokens/printers.py file 2016-12-30 20:19:18 +03:00			`merge_ents(doc_clone) # merge the entities into single tokens first`
			`return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]`