From f046e0d7c82e0dc6f4e8e6d7857c51db13bbd805 Mon Sep 17 00:00:00 2001
From: kengz <kengzwl@gmail.com>
Date: Sun, 16 Oct 2016 14:20:23 -0400
Subject: [PATCH 1/4] add parse_tree method to language, separate from __call__
 for efficiency, but will use __call__ to get the doc

---
 spacy/language.py                     | 56 +++++++++++++++++++++++++++
 spacy/tests/tokens/test_tokens_api.py |  7 ++++
 2 files changed, 63 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index df7728d08..b9edf1379 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -261,6 +261,62 @@ class Language(object):
                 proc(doc)
         return doc
 
+    def merge_ents(self, doc):
+        '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
+        for ent in doc.ents:
+            ent.merge(ent.root.tag_, ent.text, ent.label_)
+        return doc
+
+    def format_POS(self, token, light=False, flat=False):
+        '''helper: form the POS output for a token'''
+        subtree = dict([
+            ("word", token.text),
+            ("lemma", token.lemma_),  # trigger
+            ("NE", token.ent_type_),  # trigger
+            ("POS_fine", token.tag_),
+            ("POS_coarse", token.pos_),
+            ("arc", token.dep_),
+            ("modifiers", [])
+        ])
+        if light:
+            subtree.pop("lemma")
+            subtree.pop("NE")
+        if flat:
+            subtree.pop("arc")
+            subtree.pop("modifiers")
+        return subtree
+
+    def POS_tree_(self, root, light=False):
+        '''Helper: generate a POS tree for a root token.
+        The doc must have merge_ents(doc) ran on it.
+        '''
+        subtree = self.format_POS(root, light=light)
+        for c in root.children:
+            subtree["modifiers"].append(self.POS_tree_(c))
+        return subtree
+
+    def parse_tree_(self, doc, light=False):
+        '''generate the POS tree for all sentences in a doc'''
+        self.merge_ents(doc)  # merge the entities into single tokens first
+        return [self.POS_tree_(sent.root, light=light) for sent in doc.sents]
+
+    def parse_tree(self, text, tag=True, parse=True, entity=True, light=False):
+        """Apply self.__call__ and use the resulting doc to construct a syntactic parse tree, similar to the one used in displaCy.
+
+        Args:
+            text (unicode): The text to be processed.
+
+        Returns:
+            [parse_trees (OrderedDicts)]:
+
+        >>> from spacy.en import English
+        >>> nlp = English()
+        >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.')
+        [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
+        """
+        doc = self.__call__(text, tag=tag, parse=parse, entity=entity)
+        return self.parse_tree_(doc, light=light)
+
     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2,
             batch_size=1000):
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py
index 47ad8545f..74522d574 100644
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@@ -206,3 +206,10 @@ def test_right_edge(EN):
 def test_has_vector(EN):
     doc = EN(u'''apple orange pear''')
     assert doc.has_vector
+
+
+def test_parse_tree(EN):
+    trees = EN.parse_tree(u'''Bob brought Alice the pizza.''')
+    assert len(trees) > 0
+    tree = trees[0]
+    assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])

From 17b7832419eaa66599deb050a69543916993af61 Mon Sep 17 00:00:00 2001
From: kengz <kengzwl@gmail.com>
Date: Sun, 16 Oct 2016 14:39:07 -0400
Subject: [PATCH 2/4] mark test as needing models

---
 spacy/tests/tokens/test_tokens_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py
index 74522d574..00ace4c42 100644
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@@ -208,6 +208,7 @@ def test_has_vector(EN):
     assert doc.has_vector
 
 
+@pytest.mark.models
 def test_parse_tree(EN):
     trees = EN.parse_tree(u'''Bob brought Alice the pizza.''')
     assert len(trees) > 0

From fb92e2d06184ffe52c2b7fb3122f31f6acc1a448 Mon Sep 17 00:00:00 2001
From: kengz <kengzwl@gmail.com>
Date: Sun, 16 Oct 2016 15:12:08 -0400
Subject: [PATCH 3/4] activate parse_tree test, use from_array, test for root
 correctness

---
 spacy/tests/tokens/test_tokens_api.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py
index 00ace4c42..1b46ecb67 100644
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@@ -208,9 +208,14 @@ def test_has_vector(EN):
     assert doc.has_vector
 
 
-@pytest.mark.models
 def test_parse_tree(EN):
-    trees = EN.parse_tree(u'''Bob brought Alice the pizza.''')
+    text = 'I like New York in Autumn.'
+    EN = English(parser=False)
+    doc = EN(text, tag=True)
+    doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
+    # full method parse_tree(text) is a trivial composition
+    trees = EN.parse_tree_(doc)
     assert len(trees) > 0
     tree = trees[0]
     assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
+    assert tree['word'] == 'like' # check root is correct

From da44183ae13e26dc74abe80ff487817cfdf6a2bd Mon Sep 17 00:00:00 2001
From: kengz <kengzwl@gmail.com>
Date: Fri, 30 Dec 2016 12:19:18 -0500
Subject: [PATCH 4/4] move parse_tree logic to a new tokens/printers.py file

---
 spacy/language.py                     | 56 ---------------------------
 spacy/tests/tokens/test_tokens_api.py |  2 +-
 spacy/tokens/doc.pyx                  |  5 +++
 spacy/tokens/printers.py              | 54 ++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 57 deletions(-)
 create mode 100644 spacy/tokens/printers.py

diff --git a/spacy/language.py b/spacy/language.py
index b9edf1379..df7728d08 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -261,62 +261,6 @@ class Language(object):
                 proc(doc)
         return doc
 
-    def merge_ents(self, doc):
-        '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
-        for ent in doc.ents:
-            ent.merge(ent.root.tag_, ent.text, ent.label_)
-        return doc
-
-    def format_POS(self, token, light=False, flat=False):
-        '''helper: form the POS output for a token'''
-        subtree = dict([
-            ("word", token.text),
-            ("lemma", token.lemma_),  # trigger
-            ("NE", token.ent_type_),  # trigger
-            ("POS_fine", token.tag_),
-            ("POS_coarse", token.pos_),
-            ("arc", token.dep_),
-            ("modifiers", [])
-        ])
-        if light:
-            subtree.pop("lemma")
-            subtree.pop("NE")
-        if flat:
-            subtree.pop("arc")
-            subtree.pop("modifiers")
-        return subtree
-
-    def POS_tree_(self, root, light=False):
-        '''Helper: generate a POS tree for a root token.
-        The doc must have merge_ents(doc) ran on it.
-        '''
-        subtree = self.format_POS(root, light=light)
-        for c in root.children:
-            subtree["modifiers"].append(self.POS_tree_(c))
-        return subtree
-
-    def parse_tree_(self, doc, light=False):
-        '''generate the POS tree for all sentences in a doc'''
-        self.merge_ents(doc)  # merge the entities into single tokens first
-        return [self.POS_tree_(sent.root, light=light) for sent in doc.sents]
-
-    def parse_tree(self, text, tag=True, parse=True, entity=True, light=False):
-        """Apply self.__call__ and use the resulting doc to construct a syntactic parse tree, similar to the one used in displaCy.
-
-        Args:
-            text (unicode): The text to be processed.
-
-        Returns:
-            [parse_trees (OrderedDicts)]:
-
-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.')
-        [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
-        """
-        doc = self.__call__(text, tag=tag, parse=parse, entity=entity)
-        return self.parse_tree_(doc, light=light)
-
     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2,
             batch_size=1000):
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
diff --git a/spacy/tests/tokens/test_tokens_api.py b/spacy/tests/tokens/test_tokens_api.py
index 1b46ecb67..d229ae7b1 100644
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@@ -214,7 +214,7 @@ def test_parse_tree(EN):
     doc = EN(text, tag=True)
     doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
     # full method parse_tree(text) is a trivial composition
-    trees = EN.parse_tree_(doc)
+    trees = doc.print_tree()
     assert len(trees) > 0
     tree = trees[0]
     assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 66654482e..758b0290f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -21,6 +21,7 @@ from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
 from .token cimport Token
+from .printers import parse_tree
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
@@ -649,6 +650,10 @@ cdef class Doc:
         # Return the merged Python object
         return self[start]
 
+    def print_tree(self, light=False, flat=False):
+        """Return the parse trees for this doc in the JSON (dict) format; light and flat are passed through to parse_tree."""
+        return parse_tree(self, light=light, flat=flat)
+
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
     cdef int i
diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py
new file mode 100644
index 000000000..d70088540
--- /dev/null
+++ b/spacy/tokens/printers.py
@@ -0,0 +1,54 @@
+from copy import deepcopy
+
+def merge_ents(doc):
+    '''Helper: merge each entity in doc.ents into a single token; modifies the doc and returns it.'''
+    for ent in doc.ents:
+        ent.merge(ent.root.tag_, ent.text, ent.label_)
+    return doc
+
+def format_POS(token, light, flat):
+    '''Helper: build the output dict for a single token.
+    light drops the "lemma" and "NE" keys; flat drops the "arc" and
+    "modifiers" keys.
+    '''
+    node = {
+        "word": token.text,
+        "lemma": token.lemma_,
+        "NE": token.ent_type_,
+        "POS_fine": token.tag_,
+        "POS_coarse": token.pos_,
+        "arc": token.dep_,
+        "modifiers": [],
+    }
+    hidden = (("lemma", "NE") if light else ()) + (("arc", "modifiers") if flat else ())
+    for key in hidden:
+        del node[key]
+    return node
+
+def POS_tree(root, light, flat):
+    '''Helper: generate a POS tree for a root token; merge_ents(doc) must have
+    been run on the doc. light/flat are forwarded to every child. NOTE(review):
+    flat=True makes format_POS drop "modifiers", so the append below raises.'''
+    subtree = format_POS(root, light=light, flat=flat)
+    for c in root.children:
+        subtree["modifiers"].append(POS_tree(c, light=light, flat=flat))
+    return subtree
+
+def parse_tree(doc, light=False, flat=False):
+    """Makes a copy of the doc, merges entities on the copy, then constructs a syntactic parse tree for every sentence in the doc, similar to the one used in displaCy.
+
+    Args:
+        doc: The doc for parsing.
+
+    Returns:
+        [parse_trees (Dict)]:
+
+    >>> from spacy.en import English
+    >>> nlp = English()
+    >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
+    >>> doc.print_tree()
+    [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
+    """
+    doc_clone = deepcopy(doc)  # NOTE(review): assumes Doc supports copy.deepcopy -- confirm
+    merge_ents(doc_clone)  # merge the entities into single tokens first
+    return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]