Add method to update / reset pkuseg user dict (#5404)

This commit is contained in:
adrianeboyd 2020-05-08 11:21:46 +02:00 committed by GitHub
parent 4a15b559ba
commit c963e269ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 0 deletions

View File

@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer):
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
def pkuseg_update_user_dict(self, words, reset=False):
    """Insert words into the pkuseg user dictionary, optionally resetting it.

    Does nothing when no pkuseg segmenter is loaded (`self.pkuseg_seg` is
    falsy).

    words (iterable): Strings to insert. Each one is stripped of
        surrounding whitespace and inserted with an empty tag.
    reset (bool): If True, first replace the segmenter's preprocesser with
        a fresh `pkuseg.Preprocesser(None)`, discarding existing entries.
    RAISES (ImportError): If a reset is requested, pkuseg cannot be
        imported and `self.use_pkuseg` is set.
    """
    segmenter = self.pkuseg_seg
    if not segmenter:
        return
    if reset:
        try:
            import pkuseg

            segmenter.preprocesser = pkuseg.Preprocesser(None)
        except ImportError:
            # The failed import is ignored unless `use_pkuseg` is set.
            if self.use_pkuseg:
                raise ImportError(
                    "pkuseg not installed: unable to reset pkuseg "
                    "user dict. Please " + _PKUSEG_INSTALL_MSG
                )
    for entry in words:
        segmenter.preprocesser.insert(entry.strip(), "")
def _get_config(self):
config = OrderedDict(
(

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
from spacy.lang.zh import _get_pkuseg_trie_data
# fmt: off
@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
assert tokens == expected_tokens
def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
    """Check that the pkuseg user dict grows on update and empties on reset."""

    def current_entries():
        # Look the preprocesser up on every call: a reset swaps in a new one.
        trie = zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie
        return _get_pkuseg_trie_data(trie)

    size_before = len(current_entries())
    zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
    size_after = len(current_entries())
    assert size_before == size_after - 1

    # reset user dict
    zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
    assert len(current_entries()) == 0
def test_extra_spaces(zh_tokenizer_char):
# note: three spaces after "I"
tokens = zh_tokenizer_char("I like cheese.")