mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Update customizing-tokenizer.jade
Update some code examples: - `me` -> `-PRON-` - `TAG` -> `POS` - `create_tokenizer` function
This commit is contained in:
		
							parent
							
								
									ea2732469b
								
							
						
					
					
						commit
						6f450306c3
					
				|  | @ -40,7 +40,9 @@ p | ||||||
|             { |             { | ||||||
|                 ORTH: u'me'}]) |                 ORTH: u'me'}]) | ||||||
|     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] |     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] | ||||||
|     assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] |     # Pronoun lemma is returned as -PRON- | ||||||
|  |     # For more details, please see: https://spacy.io/docs/usage/troubleshooting#pron-lemma | ||||||
|  |     assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  The special case doesn't have to match an entire whitespace-delimited |     |  The special case doesn't have to match an entire whitespace-delimited | ||||||
|  | @ -57,7 +59,7 @@ p | ||||||
| +code. | +code. | ||||||
|     nlp.tokenizer.add_special_case(u'...gimme...?', |     nlp.tokenizer.add_special_case(u'...gimme...?', | ||||||
|         [{ |         [{ | ||||||
|             ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) |             ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}]) | ||||||
|     assert len(nlp(u'...gimme...?')) == 1 |     assert len(nlp(u'...gimme...?')) == 1 | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -172,12 +174,14 @@ p | ||||||
| 
 | 
 | ||||||
|     prefix_re = re.compile(r'''[\[\("']''') |     prefix_re = re.compile(r'''[\[\("']''') | ||||||
|     suffix_re = re.compile(r'''[\]\)"']''') |     suffix_re = re.compile(r'''[\]\)"']''') | ||||||
|  |     infix_re = re.compile(r'''[-~]''') | ||||||
|     def create_tokenizer(nlp): |     def create_tokenizer(nlp): | ||||||
|         return Tokenizer(nlp.vocab, |         return Tokenizer(nlp.vocab, rules={}, | ||||||
|                 prefix_search=prefix_re.search, |                 prefix_search=prefix_re.search, | ||||||
|                 suffix_search=suffix_re.search) |                 suffix_search=suffix_re.search, | ||||||
|  |                 infix_finditer=infix_re.finditer) | ||||||
| 
 | 
 | ||||||
|     nlp = spacy.load('en', tokenizer=create_make_doc) |     nlp = spacy.load('en', create_make_doc=create_tokenizer) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  If you need to subclass the tokenizer instead, the relevant methods to |     |  If you need to subclass the tokenizer instead, the relevant methods to | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user