Mirror of https://github.com/explosion/spaCy.git

move parse_tree logic to a new tokens/printers.py file

commit da44183ae1
parent fb92e2d061
@@ -261,62 +261,6 @@ class Language(object):
             proc(doc)
         return doc

-    def merge_ents(self, doc):
-        '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
-        for ent in doc.ents:
-            ent.merge(ent.root.tag_, ent.text, ent.label_)
-        return doc
-
-    def format_POS(self, token, light=False, flat=False):
-        '''Helper: form the POS output for a token.'''
-        subtree = dict([
-            ("word", token.text),
-            ("lemma", token.lemma_),  # trigger
-            ("NE", token.ent_type_),  # trigger
-            ("POS_fine", token.tag_),
-            ("POS_coarse", token.pos_),
-            ("arc", token.dep_),
-            ("modifiers", [])
-        ])
-        if light:
-            subtree.pop("lemma")
-            subtree.pop("NE")
-        if flat:
-            subtree.pop("arc")
-            subtree.pop("modifiers")
-        return subtree
-
-    def POS_tree_(self, root, light=False):
-        '''Helper: generate a POS tree for a root token.
-
-        The doc must have had merge_ents(doc) run on it.
-        '''
-        subtree = self.format_POS(root, light=light)
-        for c in root.children:
-            subtree["modifiers"].append(self.POS_tree_(c, light=light))
-        return subtree
-
-    def parse_tree_(self, doc, light=False):
-        '''Generate the POS tree for all sentences in a doc.'''
-        self.merge_ents(doc)  # merge the entities into single tokens first
-        return [self.POS_tree_(sent.root, light=light) for sent in doc.sents]
-
-    def parse_tree(self, text, tag=True, parse=True, entity=True, light=False):
-        """Apply self.__call__ and use the resulting doc to construct a
-        syntactic parse tree, similar to the one used in displaCy.
-
-        Args:
-            text (unicode): The text to be processed.
-
-        Returns:
-            [parse_trees (OrderedDicts)]:
-
-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> trees = nlp.parse_tree('Bob brought Alice the pizza. Alice ate the pizza.')
-        [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
-        """
-        doc = self.__call__(text, tag=tag, parse=parse, entity=entity)
-        return self.parse_tree_(doc, light=light)

     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2,
              batch_size=1000):
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
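The hunk above removes the tree-printing API from Language; its replacement, Doc.print_tree(), is added further down in this commit. A minimal before/after sketch, assuming the spacy.en.English pipeline used in the removed docstring:

    from spacy.en import English

    nlp = English()
    # Before this commit: the tree was built through the Language object.
    trees = nlp.parse_tree('Bob brought Alice the pizza.')
    # After this commit: make the Doc first, then call the new Doc method.
    doc = nlp('Bob brought Alice the pizza.')
    trees = doc.print_tree()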
@@ -214,7 +214,7 @@ def test_parse_tree(EN):
     doc = EN(text, tag=True)
     doc.from_array([HEAD], numpy.asarray([[1, 0, 1, -2, -3, -1, -5]], dtype='int32').T)
     # full method parse_tree(text) is a trivial composition
-    trees = EN.parse_tree_(doc)
+    trees = doc.print_tree()
     assert len(trees) > 0
     tree = trees[0]
     assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
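A note on the test fixture above: doc.from_array([HEAD], ...) installs a dependency parse directly, giving each token its head as a relative offset, so the test does not rely on the statistical parser. A small self-contained sketch of the idea, with an illustrative three-token sentence rather than the one from the test:

    import numpy
    from spacy.en import English
    from spacy.attrs import HEAD

    nlp = English()
    doc = nlp('Alice ate pizza', parse=False)  # tokenize and tag only
    # One value per token: 0 marks the sentence root, 1 means "my head is
    # the next token", -1 means "my head is the previous token".
    doc.from_array([HEAD], numpy.asarray([[1, 0, -1]], dtype='int32').T)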
@@ -21,6 +21,7 @@ from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
 from .token cimport Token
+from .printers import parse_tree
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
@@ -649,6 +650,10 @@ cdef class Doc:
         # Return the merged Python object
         return self[start]

+    def print_tree(self, light=False, flat=False):
+        """Returns the parse trees in JSON (dict) format."""
+        return parse_tree(self, light=light, flat=flat)
+

 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
     cdef int i
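A short usage sketch for the new method; the light and flat keywords come straight from the signature above, and the pipeline setup follows the docstrings in this commit:

    from spacy.en import English

    nlp = English()
    doc = nlp('Bob brought Alice the pizza.')

    # light=True drops the 'lemma' and 'NE' keys from every node;
    # flat=True would additionally drop 'arc' and 'modifiers'.
    trees = doc.print_tree(light=True)
    print(trees[0]['word'], [m['word'] for m in trees[0]['modifiers']])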
spacy/tokens/printers.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+from copy import deepcopy
+
+
+def merge_ents(doc):
+    '''Helper: merge adjacent entities into single tokens; modifies the doc.'''
+    for ent in doc.ents:
+        ent.merge(ent.root.tag_, ent.text, ent.label_)
+    return doc
+
+
+def format_POS(token, light, flat):
+    '''Helper: form the POS output for a token.'''
+    subtree = dict([
+        ("word", token.text),
+        ("lemma", token.lemma_),  # trigger
+        ("NE", token.ent_type_),  # trigger
+        ("POS_fine", token.tag_),
+        ("POS_coarse", token.pos_),
+        ("arc", token.dep_),
+        ("modifiers", [])
+    ])
+    if light:
+        subtree.pop("lemma")
+        subtree.pop("NE")
+    if flat:
+        subtree.pop("arc")
+        subtree.pop("modifiers")
+    return subtree
+
+
+def POS_tree(root, light, flat):
+    '''Helper: generate a POS tree for a root token.
+
+    The doc must have had merge_ents(doc) run on it.
+    '''
+    subtree = format_POS(root, light=light, flat=flat)
+    if not flat:  # with flat=True the "modifiers" key has been dropped
+        for c in root.children:
+            # recurse, propagating the light/flat options to every level
+            subtree["modifiers"].append(POS_tree(c, light, flat))
+    return subtree
+
+
+def parse_tree(doc, light=False, flat=False):
+    """Make a copy of the doc and construct a syntactic parse tree similar
+    to the one used in displaCy. Generates the POS tree for all sentences
+    in a doc.
+
+    Args:
+        doc: The doc for parsing.
+
+    Returns:
+        [parse_trees (dict)]:
+
+    >>> from spacy.en import English
+    >>> nlp = English()
+    >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
+    >>> trees = doc.print_tree()
+    [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
+    """
+    doc_clone = deepcopy(doc)
+    merge_ents(doc_clone)  # merge the entities into single tokens first
+    return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
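Unlike the removed Language.parse_tree_, which merged entities into the caller's doc in place, the new module deep-copies the Doc first, so printing a tree never mutates the original tokens. The module's functions can also be called directly, without going through Doc.print_tree(); a sketch under the same pipeline assumptions as above:

    from spacy.en import English
    from spacy.tokens.printers import parse_tree

    nlp = English()
    doc = nlp('Alice ate the pizza.')

    trees = parse_tree(doc)  # one nested dict per sentence
    print(sorted(trees[0].keys()))
    # ['NE', 'POS_coarse', 'POS_fine', 'arc', 'lemma', 'modifiers', 'word']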