* Allow regular expressions to specify labels for merged spans

This commit is contained in:
Matthew Honnibal 2015-03-27 17:40:52 +01:00
parent a3af6b7c3d
commit 557856e84c

View File

@@ -12,10 +12,11 @@ cdef class Spans:
token_starts = {t.idx: t.i for t in self.tokens} token_starts = {t.idx: t.i for t in self.tokens}
token_ends = {t.idx + len(t): t.i+1 for t in self.tokens} token_ends = {t.idx + len(t): t.i+1 for t in self.tokens}
tokens_str = unicode(self.tokens) tokens_str = unicode(self.tokens)
for match in mwe_re.finditer(tokens_str): for label, regex in mwe_re:
start = token_starts.get(match.start()) for match in regex.finditer(tokens_str):
end = token_ends.get(match.end()) start = token_starts.get(match.start())
self.merge(start, end) end = token_ends.get(match.end())
self.merge(start, end, label=label)
if merge_ents: if merge_ents:
# Merge named entities and units # Merge named entities and units
for ent in self.tokens.ents: for ent in self.tokens.ents:
@@ -128,12 +129,16 @@ cdef class Span:
property orth_: property orth_:
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ' '.join([t.string for t in self]).strip()
property lemma_: property lemma_:
def __get__(self): def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
property string:
def __get__(self):
return ''.join([t.string for t in self])
property label_: property label_:
def __get__(self): def __get__(self):
return self._seq.vocab.strings[self.label] return self._seq.vocab.strings[self.label]