mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
* Add noun_chunks iterator, and fix left/right child setting in Doc.merge
This commit is contained in:
parent
d153f18969
commit
74d8cb3980
|
@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
|
|||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
from ..parts_of_speech import UNIV_POS_NAMES
|
||||
from ..parts_of_speech cimport CONJ, PUNCT
|
||||
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
|
||||
from ..lexeme cimport check_flag
|
||||
from ..lexeme cimport get_attr as get_lex_attr
|
||||
from .spans import Span
|
||||
from .spans cimport Span
|
||||
from .token cimport Token
|
||||
from ..serialize.bits cimport BitArray
|
||||
|
||||
|
@ -154,6 +154,18 @@ cdef class Doc:
|
|||
if start != -1:
|
||||
yield Span(self, start, self.length, label=label)
|
||||
|
||||
@property
def noun_chunks(self):
    """Yield spans for base noun phrases.

    Walks the token array once; every token whose coarse POS is NOUN and
    whose dependency relation is one of the noun-phrase relations below
    yields a Span labelled 'NP'.  The span runs from the token's l_edge
    (the leftmost token of its syntactic subtree, as maintained by
    set_children_from_heads) up to and including the token itself.
    """
    cdef const TokenC* word
    # Dependency relations whose NOUN bearer heads a base noun phrase.
    labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
    # Pre-encode the label strings to integer IDs once, so the per-token
    # test below is an int membership check rather than string lookups.
    np_deps = [self.vocab.strings[label] for label in labels]
    np_label = self.vocab.strings['NP']
    for i in range(self.length):
        word = &self.data[i]
        if word.pos == NOUN and word.dep in np_deps:
            # i+1 because Span's end index is exclusive.
            yield Span(self, word.l_edge, i+1, label=np_label)
|
||||
|
||||
@property
|
||||
def sents(self):
|
||||
"""
|
||||
|
@ -297,20 +309,7 @@ cdef class Doc:
|
|||
elif attr_id == ENT_TYPE:
|
||||
for i in range(length):
|
||||
tokens[i].ent_type = values[i]
|
||||
cdef TokenC* head
|
||||
cdef TokenC* child
|
||||
# Set left edges
|
||||
for i in range(length):
|
||||
child = &tokens[i]
|
||||
head = &tokens[i + child.head]
|
||||
if child < head and child.l_edge < head.l_edge:
|
||||
head.l_edge = child.l_edge
|
||||
# Set right edges --- same as above, but iterate in reverse
|
||||
for i in range(length-1, -1, -1):
|
||||
child = &tokens[i]
|
||||
head = &tokens[i + child.head]
|
||||
if child > head and child.r_edge > head.r_edge:
|
||||
head.r_edge = child.r_edge
|
||||
set_children_from_heads(self.data, self.length)
|
||||
return self
|
||||
|
||||
def to_bytes(self):
|
||||
|
@ -354,9 +353,12 @@ cdef class Doc:
|
|||
break
|
||||
else:
|
||||
return None
|
||||
cdef unicode string = self.string
|
||||
|
||||
cdef Span span = self[start:end]
|
||||
# Get LexemeC for newly merged token
|
||||
new_orth = string[start_idx:end_idx]
|
||||
new_orth = ''.join([t.string for t in span])
|
||||
if span[-1].whitespace_:
|
||||
new_orth = new_orth[:-1]
|
||||
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
||||
# House the new merged token where it starts
|
||||
cdef TokenC* token = &self.data[start]
|
||||
|
@ -372,30 +374,16 @@ cdef class Doc:
|
|||
else:
|
||||
token.ent_iob = 3
|
||||
token.ent_type = self.vocab.strings[ent_type]
|
||||
# Fix dependencies
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
# This is easier to work with for now than the offsets
|
||||
# Before thinking of something simpler, beware the case where a dependency
|
||||
# bridges over the entity. Here the alignment of the tokens changes.
|
||||
span_root = span.root.i
|
||||
for i in range(self.length):
|
||||
self.data[i].head += i
|
||||
# Find the head of the merged token, and its dep relation
|
||||
outer_heads = {}
|
||||
for i in range(start, end):
|
||||
head_idx = self.data[i].head
|
||||
if head_idx == i or head_idx < start or head_idx >= end:
|
||||
# Don't consider "heads" which are actually dominated by a word
|
||||
# in the region we're merging
|
||||
gp = head_idx
|
||||
while self.data[gp].head != gp:
|
||||
if start <= gp < end:
|
||||
break
|
||||
gp = self.data[gp].head
|
||||
else:
|
||||
# If we have multiple words attaching to the same head,
|
||||
# but with different dep labels, we're preferring the last
|
||||
# occurring dep label. Shrug. What else could we do, I guess?
|
||||
outer_heads[head_idx] = self.data[i].dep
|
||||
|
||||
token.head, token.dep = max(outer_heads.items())
|
||||
# Set the head of the merged token, and its dep relation, from the Span
|
||||
token.head = self.data[span_root].head
|
||||
token.dep = span.root.dep
|
||||
# Adjust deps before shrinking tokens
|
||||
# Tokens which point into the merged token should now point to it
|
||||
# Subtract the offset from all tokens which point to >= end
|
||||
|
@ -406,7 +394,6 @@ cdef class Doc:
|
|||
self.data[i].head = start
|
||||
elif head_idx >= end:
|
||||
self.data[i].head -= offset
|
||||
# TODO: Fix left and right deps
|
||||
# Now compress the token array
|
||||
for i in range(end, self.length):
|
||||
self.data[i - offset] = self.data[i]
|
||||
|
@ -417,6 +404,28 @@ cdef class Doc:
|
|||
for i in range(self.length):
|
||||
# ...And, set heads back to a relative position
|
||||
self.data[i].head -= i
|
||||
|
||||
# Set the left/right children, left/right edges
|
||||
set_children_from_heads(self.data, self.length)
|
||||
# Clear the cached Python objects
|
||||
self._py_tokens = [None] * self.length
|
||||
# Return the merged Python object
|
||||
return self[start]
|
||||
|
||||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    """Recompute each token's l_edge/r_edge subtree boundaries from heads.

    Assumes tokens[i].head holds a RELATIVE offset, i.e. the head of
    token i lives at tokens[i + tokens[i].head].  Two linear passes
    propagate edges up the tree: left-to-right for left edges (every
    left-dependent is visited before its head, so its l_edge is final
    when the head reads it), then right-to-left for right edges.

    NOTE: pointer comparisons (child < head) are valid here because both
    pointers address elements of the same array, so address order is
    token order.
    """
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set left edges
    for i in range(length):
        child = &tokens[i]
        head = &tokens[i + child.head]
        # Only a child that precedes its head can widen the head's
        # left edge.
        if child < head and child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
    # Set right edges --- same as above, but iterate in reverse
    for i in range(length-1, -1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if child > head and child.r_edge > head.r_edge:
            head.r_edge = child.r_edge
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user