* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

This commit is contained in:
Matthew Honnibal 2015-07-30 02:29:49 +02:00
parent d153f18969
commit 74d8cb3980

View File

@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..lexeme cimport check_flag from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span from .spans cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
@ -154,6 +154,18 @@ cdef class Doc:
if start != -1: if start != -1:
yield Span(self, start, self.length, label=label) yield Span(self, start, self.length, label=label)
@property
def noun_chunks(self):
    """Yield spans for base noun phrases.

    A token is treated as the head of a chunk when its coarse POS tag
    is NOUN and its dependency label is one of the noun-phrase relations
    listed below.  The yielded Span runs from the token's leftmost
    syntactic descendant (``l_edge``) through the token itself.
    """
    cdef const TokenC* word
    # Dependency labels that mark a noun as heading a base NP.
    # NOTE(review): 'conj' will also yield conjoined nouns as separate
    # chunks — presumably intended; confirm against expected output.
    labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
    # Convert label strings to integer IDs once, for fast comparison
    # against the integer ``dep`` field inside the loop.
    np_deps = [self.vocab.strings[label] for label in labels]
    np_label = self.vocab.strings['NP']
    for i in range(self.length):
        word = &self.data[i]
        if word.pos == NOUN and word.dep in np_deps:
            # Span is [l_edge, i+1): from the noun's left edge up to
            # and including the noun itself.
            yield Span(self, word.l_edge, i+1, label=np_label)
@property @property
def sents(self): def sents(self):
""" """
@ -297,20 +309,7 @@ cdef class Doc:
elif attr_id == ENT_TYPE: elif attr_id == ENT_TYPE:
for i in range(length): for i in range(length):
tokens[i].ent_type = values[i] tokens[i].ent_type = values[i]
cdef TokenC* head set_children_from_heads(self.data, self.length)
cdef TokenC* child
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge
return self return self
def to_bytes(self): def to_bytes(self):
@ -354,9 +353,12 @@ cdef class Doc:
break break
else: else:
return None return None
cdef unicode string = self.string
cdef Span span = self[start:end]
# Get LexemeC for newly merged token # Get LexemeC for newly merged token
new_orth = string[start_idx:end_idx] new_orth = ''.join([t.string for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-1]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts # House the new merged token where it starts
cdef TokenC* token = &self.data[start] cdef TokenC* token = &self.data[start]
@ -372,30 +374,16 @@ cdef class Doc:
else: else:
token.ent_iob = 3 token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type] token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
span_root = span.root.i
for i in range(self.length): for i in range(self.length):
self.data[i].head += i self.data[i].head += i
# Find the head of the merged token, and its dep relation # Set the head of the merged token, and its dep relation, from the Span
outer_heads = {} token.head = self.data[span_root].head
for i in range(start, end): token.dep = span.root.dep
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Adjust deps before shrinking tokens # Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it # Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end # Subtract the offset from all tokens which point to >= end
@ -406,7 +394,6 @@ cdef class Doc:
self.data[i].head = start self.data[i].head = start
elif head_idx >= end: elif head_idx >= end:
self.data[i].head -= offset self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array # Now compress the token array
for i in range(end, self.length): for i in range(end, self.length):
self.data[i - offset] = self.data[i] self.data[i - offset] = self.data[i]
@ -417,6 +404,28 @@ cdef class Doc:
for i in range(self.length): for i in range(self.length):
# ...And, set heads back to a relative position # ...And, set heads back to a relative position
self.data[i].head -= i self.data[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(self.data, self.length)
# Clear the cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object # Return the merged Python object
return self[start] return self[start]
# Recompute each token's l_edge/r_edge (the bounds of its syntactic
# subtree) from the relative head offsets stored in ``tokens[i].head``.
# Shared by Doc.from_array and Doc.merge so edge bookkeeping stays in
# one place.
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set left edges: forward pass.  ``child < head`` compares pointers
    # into the same array, i.e. it is true when the child precedes its
    # head in token order; such a child may extend the head's l_edge.
    for i in range(length):
        child = &tokens[i]
        # ``child.head`` is an offset relative to position i.
        head = &tokens[i + child.head]
        if child < head and child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
    # Set right edges --- same as above, but iterate in reverse
    # so a right-side child is finalized before its head is visited.
    for i in range(length-1, -1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if child > head and child.r_edge > head.r_edge:
            head.r_edge = child.r_edge