Handle out-of-vocab words
The Japanese tokenizer wasn't properly handling words outside the tokenizer dictionary's vocabulary. This adds a fix and a test for that. -POLM
parent 95050201ce
commit 8b3e1f7b5b
@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
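For context on the guard: MeCab's node.feature is a comma-separated string whose length depends on whether the word is in the tokenizer dictionary. Out-of-vocab words are missing the reading and lemma fields, so indexing parts[6] or parts[7] unguarded raises IndexError. Below is a minimal sketch of that behaviour, not part of the commit; it assumes the mecab-python3 bindings and a UniDic-style dictionary (exact field counts vary by dictionary).

# Sketch only: inspect how many feature fields MeCab reports per token.
# Assumes mecab-python3 and a UniDic-style dictionary; field layouts vary.
import MeCab

tagger = MeCab.Tagger()
node = tagger.parseToNode('プププランドに行きたい')
node = node.next  # first node is beginning of sentence and empty, skip it
while node.posid != 0:
    parts = node.feature.split(',')
    # In-dictionary words carry reading/lemma fields at parts[6]/parts[7];
    # an out-of-vocab word like プププランド yields a shorter feature string.
    print(node.surface, len(parts))
    node = node.next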
@@ -22,7 +22,12 @@ TAGGER_TESTS = [
     ('動詞,一般,*,*', 'VERB'),
     ('助詞,接続助詞,*,*', 'SCONJ'),
     ('動詞,非自立可能,*,*', 'VERB'),
-    ('助詞,終助詞,*,*', 'PART')))
+    ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
 ]

 @pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
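The added sentence プププランドに行きたい ('I want to go to Pupupu Land') exercises the new out-of-vocab branch: プププランド is a fictional place name that shouldn't appear in the tokenizer dictionary, while the rest of the sentence is ordinary dictionary words; MeCab still guesses a part of speech for unknown words, which is why the NOUN tag is expected. Assuming the usual spaCy test layout (the path below is an assumption), the parametrized tagger cases can be run on their own with pytest's keyword filter:

pytest spacy/tests -k tagger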