	Don't require attrs argument in Doc.retokenize and allow both ints and unicode (resolves #2304)
This commit is contained in:
parent 5768df4f09
commit b59e3b157f
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
+from ...attrs import LEMMA
 
 import pytest
 import numpy
@@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
     doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
 
 
+def test_doc_api_retokenizer(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7])
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+
+
+def test_doc_api_retokenizer_attrs(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    # test both string and integer attributes and values
+    attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    assert len(doc) == 7
+    assert doc[4].text == 'the beach boys'
+    assert doc[4].lemma_ == 'boys'
+    assert doc[4].ent_type_ == 'ORG'
+
+
 def test_doc_api_sents_empty_string(en_tokenizer):
     doc = en_tokenizer("")
     doc.is_parsed = True

@@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
+from ..attrs import intify_attrs
+from ..util import SimpleFrozenDict
 
 
 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
     cdef list merges
     cdef list splits
@@ -24,14 +26,18 @@ cdef class Retokenizer:
         self.merges = []
         self.splits = []
 
-    def merge(self, Span span, attrs=None):
-        '''Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
+        """Mark a span for merging. The attrs will be applied to the resulting
+        token.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.merges.append((span.start_char, span.end_char, attrs))
 
-    def split(self, Token token, orths, attrs=None):
-        '''Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
+        """Mark a Token for splitting, into the specified orths. The attrs
+        will be applied to each subtoken.
+        """
+        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
         self.splits.append((token.start_char, orths, attrs))
 
     def __enter__(self):
@@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
     # Clear the cached Python objects
     # Return the merged Python object
     return doc[start]
-
-
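The key helper here is `intify_attrs`, which normalizes the user-supplied dict before it is stored: attribute names given as strings are mapped to their integer IDs, and string values are resolved through the vocab's string store. A minimal sketch of that normalization, assuming a spaCy v2.x install:

from spacy.attrs import LEMMA, ENT_TYPE, intify_attrs
from spacy.vocab import Vocab

strings = Vocab().strings
mixed = {LEMMA: 'boys', 'ENT_TYPE': strings['ORG']}
normalized = intify_attrs(mixed, strings_map=strings)
# every key is now an integer attribute ID
assert set(normalized) == {LEMMA, ENT_TYPE}

Defaulting `attrs` to `SimpleFrozenDict()` rather than `None` also keeps the methods simpler: the default is an immutable empty mapping, so `intify_attrs` can be applied unconditionally without a `None` check, and the shared default can't be mutated by accident.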