Don't require attrs argument in Doc.retokenize and allow both ints and unicode (resolves #2304)

ines 2018-05-20 15:15:37 +02:00
parent 5768df4f09
commit b59e3b157f
2 changed files with 35 additions and 10 deletions
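
In short: retokenizer.merge() no longer requires the attrs argument, and attribute dicts may now mix integer IDs and unicode strings for both keys and values. A minimal usage sketch of the new behaviour (assumes a loaded pipeline bound to nlp; the example sentence is taken from the tests below):

    from spacy.attrs import LEMMA

    doc = nlp(u"WKRO played songs by the beach boys all night")
    # attrs can now be omitted entirely
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7])

    doc = nlp(u"WKRO played songs by the beach boys all night")
    # keys and values may be ints or unicode strings, in any mix
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[4:7], attrs={LEMMA: u'boys', 'ENT_TYPE': u'ORG'})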


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
+from ...attrs import LEMMA
 
 import pytest
 import numpy
@@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
     doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
 
 
+def test_doc_api_retokenizer(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7])
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+
+
+def test_doc_api_retokenizer_attrs(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    # test both string and integer attributes and values
+    attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+    assert doc[4].lemma_ == 'boys'
+    assert doc[4].ent_type_ == 'ORG'
+
+
 def test_doc_api_sents_empty_string(en_tokenizer):
     doc = en_tokenizer("")
     doc.is_parsed = True
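
The implementation (second file, below) makes this work by passing every attrs dict through intify_attrs before it is stored, so downstream merge code only ever sees integer attribute IDs and string-store hashes. Roughly, the expected behaviour is as follows (a usage sketch, not the implementation; assumes a bare Vocab):

    from spacy.attrs import intify_attrs, LEMMA, ENT_TYPE
    from spacy.vocab import Vocab

    vocab = Vocab()
    # mixed keys and values, as in the test above
    attrs = {LEMMA: u'boys', 'ENT_TYPE': u'ORG'}
    intified = intify_attrs(attrs, strings_map=vocab.strings)
    # string keys are mapped to integer attribute IDs,
    # string values are interned in the StringStore
    assert set(intified.keys()) == {LEMMA, ENT_TYPE}
    assert intified[ENT_TYPE] == vocab.strings[u'ORG']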


@@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
+from ..attrs import intify_attrs
+from ..util import SimpleFrozenDict
 
 
 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
     cdef list merges
     cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
         self.merges = []
         self.splits = []
 
-    def merge(self, Span span, attrs=None):
-        '''Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
+        """Mark a span for merging. The attrs will be applied to the resulting
+        token.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span.start_char, span.end_char, attrs))
 
-    def split(self, Token token, orths, attrs=None):
-        '''Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+        """Mark a Token for splitting, into the specified orths. The attrs
+        will be applied to each subtoken.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.splits.append((token.start_char, orths, attrs))
 
     def __enter__(self):
@@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
     # Clear the cached Python objects
     # Return the merged Python object
     return doc[start]
-
-
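
A note on the new attrs=SimpleFrozenDict() default: it replaces attrs=None, so both methods always receive a real (empty) mapping, without the classic mutable-default-argument pitfall and without an `if attrs is None` branch. spacy.util.SimpleFrozenDict is essentially a dict that refuses mutation; a sketch along these lines (not the exact implementation):

    class SimpleFrozenDict(dict):
        """Dict intended for use as a default argument value: any attempt
        to mutate the shared default raises instead of silently leaking
        state between calls."""
        def __setitem__(self, key, value):
            raise NotImplementedError("Can't write to frozen dict.")
        def pop(self, key, default=None):
            raise NotImplementedError("Can't pop from frozen dict.")
        def update(self, other):
            raise NotImplementedError("Can't update frozen dict.")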