Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 09:26:27 +03:00
* merge add lex last - add index finder funcs
This commit is contained in:
parent a06e3c8963
commit 562db6d2d0
spacy/tokens/doc.pyx

@@ -438,11 +438,25 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        """ Get index of token in doc that has character index start_idx """
+        cdef int i
+        for i in range(self.length):
+            if self.c[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        """ Get index+1 of token in doc ending with character index end_idx """
+        cdef int i
+        for i in range(self.length):
+            if (self.c[i].idx + self.c[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        """ Get tuple - span of token indices which correspond to
+        character indices (start_idx, end_idx) if such a span exists"""
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -453,10 +467,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
 
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
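Note: both index finders do a linear scan over the token array, so each lookup is O(n) in document length, and range_from_indices folds the two checks into a single pass. A minimal usage sketch of the new helpers; the `nlp` pipeline and the sentence are illustrative, not part of this commit, and the offsets assume the English tokenizer of this era:

    # `nlp` is a hypothetical loaded pipeline; offsets assume this tokenization.
    doc = nlp(u'I like New York City.')

    # "New" starts at character offset 7 and is token 2.
    assert doc.token_index_start(7) == 2

    # "City" ends at character offset 20; token_index_end returns index + 1,
    # so the pair forms a half-open [start, end) token range.
    assert doc.token_index_end(20) == 5

    # range_from_indices returns the (start, end) tuple, or None when the
    # offsets do not line up with token boundaries.
    assert doc.range_from_indices(7, 20) == (2, 5)
    assert doc.range_from_indices(8, 20) is None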
@@ -465,8 +487,6 @@ cdef class Doc:
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.c[start]
-        # Update fields
-        token.lex = lex
         token.spacy = self.c[end-1].spacy
         if tag in self.vocab.morphology.tag_map:
             self.vocab.morphology.assign_tag(token, tag)
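Note: the token.lex assignment removed in this hunk is not dropped; the next hunk reinstates it after the span-root bookkeeping, with a comment explaining why the ordering matters.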
@@ -485,6 +505,10 @@ cdef class Doc:
         # bridges over the entity. Here the alignment of the tokens changes.
         span_root = span.root.i
         token.dep = span.root.dep
+        # We update token.lex after keeping span root and dep, since
+        # setting token.lex will change span.start and span.end properties
+        # as it modifies the character offsets in the doc
+        token.lex = lex
         for i in range(self.length):
             self.c[i].head += i
         # Set the head of the merged token, and its dep relation, from the Span
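With the helpers in place, merge() resolves its character offsets through range_from_indices() before collapsing the span, and returns None when the offsets do not align with token boundaries. A sketch continuing the hypothetical example above; the tag, lemma, and entity-type arguments follow this era's merge() signature:

    # Offsets that fall inside a token now return None instead of merging.
    assert doc.merge(8, 20, u'NNP', u'York City', u'GPE') is None

    # Merge "New York City" into a single token by character offsets.
    doc.merge(7, 20, u'NNP', u'New York City', u'GPE')
    assert doc[2].text == u'New York City'
    assert len(doc) == 4  # I, like, New York City, .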