Handle out-of-vocab words

Words outside the tokenizer's dictionary vocabulary weren't being handled
properly. This adds a fix and a test for that. -POLM
Paul O'Leary McCann 2017-08-29 23:58:42 +09:00
parent 95050201ce
commit 8b3e1f7b5b
2 changed files with 13 additions and 4 deletions
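Background for the fix (not part of the commit): MeCab exposes each token as a node whose feature attribute is a comma-separated string, and for words outside the dictionary only the first few fields are filled in, so blindly indexing the reading/lemma fields raises an IndexError. A minimal sketch of the failure mode, assuming the mecab-python3 binding and an installed dictionary:

import MeCab

tagger = MeCab.Tagger()
node = tagger.parseToNode('プププランドに行きたい')
node = node.next  # skip the empty beginning-of-sentence node
while node.posid != 0:
    parts = node.feature.split(',')
    # in-dictionary words carry reading and lemma fields at parts[6] and parts[7];
    # out-of-vocab words report fewer fields, so those indexes may not exist
    print(node.surface, len(parts))
    node = node.next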


@@ -66,11 +66,15 @@ def detailed_tokens(tokenizer, text):
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
+        surface = node.surface
+        base = surface
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
-        reading = parts[6]
-        base = parts[7]
-        surface = parts[8]
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
         words.append( ShortUnitWord(surface, base, pos) )
         node = node.next
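Not part of the diff: a rough usage sketch of the patched helper, assuming ShortUnitWord is a (surface, lemma, pos) namedtuple defined earlier in the same module and that a MeCab tagger is what the function expects as its tokenizer argument:

import MeCab

tagger = MeCab.Tagger()
# 'プププランド' is a made-up place name, so it is not in the dictionary;
# with the len(parts) > 6 guard it falls back to base = surface
# instead of raising an IndexError.
for word in detailed_tokens(tagger, 'プププランドに行きたい'):
    print(word)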


@@ -22,7 +22,12 @@ TAGGER_TESTS = [
       ('動詞,一般,*,*', 'VERB'),
       ('助詞,接続助詞,*,*', 'SCONJ'),
       ('動詞,非自立可能,*,*', 'VERB'),
-      ('助詞,終助詞,*,*', 'PART')))
+      ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
 ]
@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)