Mirror of https://github.com/explosion/spaCy.git
* Add noun_chunks iterator, and fix left/right child setting in Doc.merge
parent d153f18969, commit 74d8cb3980
@@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..lexeme cimport check_flag
 from ..lexeme cimport get_attr as get_lex_attr
-from .spans import Span
+from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
@@ -154,6 +154,18 @@ cdef class Doc:
         if start != -1:
             yield Span(self, start, self.length, label=label)

+    @property
+    def noun_chunks(self):
+        """Yield spans for base noun phrases."""
+        cdef const TokenC* word
+        labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
+        np_deps = [self.vocab.strings[label] for label in labels]
+        np_label = self.vocab.strings['NP']
+        for i in range(self.length):
+            word = &self.data[i]
+            if word.pos == NOUN and word.dep in np_deps:
+                yield Span(self, word.l_edge, i+1, label=np_label)
+
     @property
     def sents(self):
         """
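
A note on the new `noun_chunks` property: each chunk runs from the noun's left edge (the leftmost token of its subtree, kept in `l_edge`) through the noun itself. A minimal plain-Python sketch of the same logic, using a toy parse rather than the real `TokenC` array (the token tuples and tag strings are illustrative, not spaCy's):

    # Toy sketch of the noun_chunks logic: a chunk runs from the noun's
    # left edge through the noun itself, yielded as a half-open span.
    NP_DEPS = {'nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj'}

    # (text, pos, dep, l_edge): l_edge is the index of the leftmost token
    # in the word's subtree -- the invariant set_children_from_heads keeps.
    tokens = [
        ('The', 'DET',  'det',   0),
        ('fox', 'NOUN', 'nsubj', 0),   # subtree starts at "The"
        ('ran', 'VERB', 'ROOT',  0),
    ]

    def noun_chunks(tokens):
        for i, (text, pos, dep, l_edge) in enumerate(tokens):
            if pos == 'NOUN' and dep in NP_DEPS:
                yield l_edge, i + 1    # like Span(self, word.l_edge, i+1)

    for start, end in noun_chunks(tokens):
        print(' '.join(t[0] for t in tokens[start:end]))   # -> The fox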
@@ -297,20 +309,7 @@ cdef class Doc:
         elif attr_id == ENT_TYPE:
             for i in range(length):
                 tokens[i].ent_type = values[i]
-        cdef TokenC* head
-        cdef TokenC* child
-        # Set left edges
-        for i in range(length):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child < head and child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
-        # Set right edges --- same as above, but iterate in reverse
-        for i in range(length-1, -1, -1):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child > head and child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
+        set_children_from_heads(self.data, self.length)
         return self

     def to_bytes(self):
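
For context on the new `set_children_from_heads` call (the function is defined at the end of this diff): `head` in the `TokenC` array is a relative offset, so the head of token `i` lives at `tokens[i + tokens[i].head]`, and the root points at itself with offset 0. A toy illustration of that convention (made-up offsets):

    # Relative-head convention used by the TokenC array: the head of
    # token i sits at index i + heads[i]; the root has offset 0.
    heads = [1, 1, 0]    # "The" -> "fox", "fox" -> "ran", "ran" is root

    for i, offset in enumerate(heads):
        print('token %d attaches to token %d' % (i, i + offset))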
@@ -354,9 +353,12 @@ cdef class Doc:
                 break
         else:
             return None
-        cdef unicode string = self.string
+        cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
-        new_orth = string[start_idx:end_idx]
+        new_orth = ''.join([t.string for t in span])
+        if span[-1].whitespace_:
+            new_orth = new_orth[:-1]
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.data[start]
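
Why the trailing trim above: `Token.string` in this era of the codebase is the token text plus its trailing whitespace, so joining the span's strings reproduces the underlying text exactly, and the final token's trailing space must be dropped to get a clean merged orth. A plain-Python sketch under that assumption:

    # Each pair mirrors (orth, trailing whitespace); Token.string is
    # assumed to be their concatenation, as used in the hunk above.
    span = [('New', ' '), ('York', ' ')]

    new_orth = ''.join(orth + ws for orth, ws in span)    # 'New York '
    if span[-1][1]:                                       # span[-1].whitespace_
        new_orth = new_orth[:-1]                          # -> 'New York'
    print(repr(new_orth))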
@@ -372,30 +374,16 @@ cdef class Doc:
         else:
             token.ent_iob = 3
             token.ent_type = self.vocab.strings[ent_type]
-        # Fix dependencies
         # Begin by setting all the head indices to absolute token positions
         # This is easier to work with for now than the offsets
+        # Before thinking of something simpler, beware the case where a dependency
+        # bridges over the entity. Here the alignment of the tokens changes.
+        span_root = span.root.i
         for i in range(self.length):
             self.data[i].head += i
-        # Find the head of the merged token, and its dep relation
-        outer_heads = {}
-        for i in range(start, end):
-            head_idx = self.data[i].head
-            if head_idx == i or head_idx < start or head_idx >= end:
-                # Don't consider "heads" which are actually dominated by a word
-                # in the region we're merging
-                gp = head_idx
-                while self.data[gp].head != gp:
-                    if start <= gp < end:
-                        break
-                    gp = self.data[gp].head
-                else:
-                    # If we have multiple words attaching to the same head,
-                    # but with different dep labels, we're preferring the last
-                    # occurring dep label. Shrug. What else could we do, I guess?
-                    outer_heads[head_idx] = self.data[i].dep
-
-        token.head, token.dep = max(outer_heads.items())
+        # Set the head of the merged token, and its dep relation, from the Span
+        token.head = self.data[span_root].head
+        token.dep = span.root.dep
         # Adjust deps before shrinking tokens
         # Tokens which point into the merged token should now point to it
         # Subtract the offset from all tokens which point to >= end
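
The deleted `outer_heads` heuristic scanned the merged region for heads outside it and broke ties arbitrarily with `max`; the replacement defers to `Span.root`, the one token in the span whose head lies outside the span (or is the sentence root). A sketch of that definition over absolute head indices (toy data, not spaCy's API):

    def span_root(heads, start, end):
        # heads[i] is the absolute index of token i's head; the span's
        # root is the token whose head falls outside [start, end).
        for i in range(start, end):
            if heads[i] == i or not (start <= heads[i] < end):
                return i
        raise ValueError('malformed parse: no root in span')

    # "the"(->office) "New"(->York) "York"(->office) "office"(->opened) "opened"(root)
    heads = [3, 2, 3, 4, 4]
    print(span_root(heads, 1, 4))   # -> 3: "office" heads "New York office"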
@@ -406,7 +394,6 @@ cdef class Doc:
                 self.data[i].head = start
             elif head_idx >= end:
                 self.data[i].head -= offset
-        # TODO: Fix left and right deps
         # Now compress the token array
         for i in range(end, self.length):
             self.data[i - offset] = self.data[i]
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
# ...And, set heads back to a relative position
|
# ...And, set heads back to a relative position
|
||||||
self.data[i].head -= i
|
self.data[i].head -= i
|
||||||
|
# Set the left/right children, left/right edges
|
||||||
|
set_children_from_heads(self.data, self.length)
|
||||||
|
# Clear the cached Python objects
|
||||||
|
self._py_tokens = [None] * self.length
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
|
|
||||||
|
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||||
|
cdef TokenC* head
|
||||||
|
cdef TokenC* child
|
||||||
|
cdef int i
|
||||||
|
# Set left edges
|
||||||
|
for i in range(length):
|
||||||
|
child = &tokens[i]
|
||||||
|
head = &tokens[i + child.head]
|
||||||
|
if child < head and child.l_edge < head.l_edge:
|
||||||
|
head.l_edge = child.l_edge
|
||||||
|
# Set right edges --- same as above, but iterate in reverse
|
||||||
|
for i in range(length-1, -1, -1):
|
||||||
|
child = &tokens[i]
|
||||||
|
head = &tokens[i + child.head]
|
||||||
|
if child > head and child.r_edge > head.r_edge:
|
||||||
|
head.r_edge = child.r_edge
|
||||||
|
|
||||||
|
|
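
On the single-pass edge propagation in `set_children_from_heads`: for a projective parse, all of a left-child's own descendants lie to its left, so a left-to-right scan finalizes each child's `l_edge` before it is folded into its head's; the reverse pass mirrors this for `r_edge`. A plain-Python rendering of the same two loops, with relative head offsets (toy example):

    def set_edges(heads):
        # heads[i] is a relative offset, as in the Cython function above.
        n = len(heads)
        l_edge = list(range(n))
        r_edge = list(range(n))
        # Left edges: forward pass; each left-child's own l_edge is
        # already final when folded into its head's (projective trees).
        for i in range(n):
            h = i + heads[i]
            if i < h and l_edge[i] < l_edge[h]:
                l_edge[h] = l_edge[i]
        # Right edges: the mirror argument, so iterate in reverse.
        for i in range(n - 1, -1, -1):
            h = i + heads[i]
            if i > h and r_edge[i] > r_edge[h]:
                r_edge[h] = r_edge[i]
        return l_edge, r_edge

    # "The"(+1) "fox"(+1) "ran"(0) "away"(-1)
    print(set_edges([1, 1, 0, -1]))   # -> ([0, 0, 0, 3], [0, 1, 3, 3])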