* Allow regular expressions to specify labels for merged spans

This commit is contained in:
Matthew Honnibal 2015-03-27 17:40:52 +01:00
parent a3af6b7c3d
commit 557856e84c

View File

@ -12,10 +12,11 @@ cdef class Spans:
token_starts = {t.idx: t.i for t in self.tokens}
token_ends = {t.idx + len(t): t.i+1 for t in self.tokens}
tokens_str = unicode(self.tokens)
for match in mwe_re.finditer(tokens_str):
for label, regex in mwe_re:
for match in regex.finditer(tokens_str):
start = token_starts.get(match.start())
end = token_ends.get(match.end())
self.merge(start, end)
self.merge(start, end, label=label)
if merge_ents:
# Merge named entities and units
for ent in self.tokens.ents:
@ -134,6 +135,10 @@ cdef class Span:
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
property string:
    # Read-only text of the span: the `string` attribute of every token
    # yielded by iterating the span, concatenated with no separator.
    def __get__(self):
        pieces = [token.string for token in self]
        return ''.join(pieces)
property label_:
    # Read-only lookup of `self.label` in the string table reached through
    # `self._seq.vocab.strings`.
    # NOTE(review): presumably this maps the integer label ID back to its
    # text form -- confirm against the string store's __getitem__.
    def __get__(self):
        string_table = self._seq.vocab.strings
        return string_table[self.label]