Mirror of https://github.com/explosion/spaCy.git, synced 2025-02-03 21:24:11 +03:00
Handle out-of-vocab words
Words that aren't in the tokenizer's dictionary vocabulary weren't being handled properly. This adds a fix and a test for that. -POLM
parent 95050201ce
commit 8b3e1f7b5b
@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
         surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
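The fix hinges on how MeCab reports features: node.feature is a comma-separated string, and only words found in the tokenizer dictionary carry the reading and lemma fields the old code read unconditionally at indices 6 and 7, so out-of-vocab words raised IndexError. Below is a minimal sketch of that behavior, assuming the mecab-python3 bindings and a UniDic-style dictionary (both assumptions; the commit itself only touches the parsing loop):

import MeCab  # assumption: the mecab-python3 bindings

tagger = MeCab.Tagger()
node = tagger.parseToNode('プププランドに行きたい')
node = node.next  # first node is beginning of sentence and empty, skip it
while node.posid != 0:
    parts = node.feature.split(',')
    # dictionary entries carry reading/lemma fields past index 5; an
    # out-of-vocab word like プププランド stops at the POS/conjugation
    # fields, so parts[6] or parts[7] would raise IndexError for it
    print(node.surface, len(parts), 'in-dict' if len(parts) > 6 else 'OOV')
    node = node.next

With the guard in place, out-of-vocab tokens simply keep the base = surface fallback set earlier in the loop.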
@@ -22,7 +22,12 @@ TAGGER_TESTS = [
     ('動詞,一般,*,*', 'VERB'),
     ('助詞,接続助詞,*,*', 'SCONJ'),
     ('動詞,非自立可能,*,*', 'VERB'),
-    ('助詞,終助詞,*,*', 'PART')))
+    ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
 ]
 
 @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
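The added プププランド sentence exercises the new out-of-vocab path, presumably chosen because that made-up place name won't appear in the tokenizer dictionary. The test function that consumes TAGGER_TESTS is outside this hunk; as a hypothetical sketch only (the ja_tokenizer fixture name and the tag_/pos_ attributes checked are assumptions, not taken from this diff), such a parametrized check could look like:

import pytest

@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
def test_ja_tagger(ja_tokenizer, text, expected_tags):
    # hypothetical sketch: compare each token's detailed tag and coarse
    # POS against the expected (tag, pos) pairs from TAGGER_TESTS
    tokens = ja_tokenizer(text)
    for token, (tag, pos) in zip(tokens, expected_tags):
        assert token.tag_ == tag
        assert token.pos_ == pos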