Mirror of https://github.com/explosion/spaCy.git
Add method to update / reset pkuseg user dict (#5404)
parent 4a15b559ba
commit c963e269ba
@@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer):
         (words, spaces) = util.get_words_and_spaces(words, text)
         return Doc(self.vocab, words=words, spaces=spaces)
 
+    def pkuseg_update_user_dict(self, words, reset=False):
+        if self.pkuseg_seg:
+            if reset:
+                try:
+                    import pkuseg
+                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                except ImportError:
+                    if self.use_pkuseg:
+                        msg = (
+                            "pkuseg not installed: unable to reset pkuseg "
+                            "user dict. Please " + _PKUSEG_INSTALL_MSG
+                        )
+                        raise ImportError(msg)
+            for word in words:
+                self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+
     def _get_config(self):
         config = OrderedDict(
             (
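In use, the new method is called on the tokenizer of a loaded Chinese pipeline. Below is a minimal usage sketch, assuming a spaCy v2.3-style setup where the pkuseg-backed Chinese tokenizer is configured through meta and the pkuseg package plus its "default" model are installed; the config keys (pkuseg_model, require_pkuseg) and the example words are illustrative assumptions, not part of this commit:

    from spacy.lang.zh import Chinese

    # Build a Chinese pipeline that segments with pkuseg (assumed v2.3-style config).
    cfg = {"pkuseg_model": "default", "require_pkuseg": True}
    nlp = Chinese(meta={"tokenizer": {"config": cfg}})

    # Append entries to the pkuseg user dict so pkuseg prefers them as single tokens.
    nlp.tokenizer.pkuseg_update_user_dict(["自然语言处理"])

    # reset=True swaps in a fresh pkuseg.Preprocesser(None) before inserting,
    # so passing an empty list with reset=True simply clears the user dict.
    nlp.tokenizer.pkuseg_update_user_dict([], reset=True)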
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import pytest
+from spacy.lang.zh import _get_pkuseg_trie_data
 
 
 # fmt: off
@@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
     assert tokens == expected_tokens
 
 
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+    user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
+    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(user_dict) == len(updated_user_dict) - 1
+
+    # reset user dict
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
+    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(reset_user_dict) == 0
+
+
 def test_extra_spaces(zh_tokenizer_char):
     # note: three spaces after "I"
     tokens = zh_tokenizer_char("I   like cheese.")
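The _get_pkuseg_trie_data helper imported in the test module is not shown in this diff; it lives in spacy.lang.zh and flattens the pkuseg Preprocesser's user-dict trie into a list so the test above can count entries. A rough sketch of the idea, assuming the trie nodes expose a children mapping, an isword flag and a usertag value (these attribute names are assumptions about pkuseg's internals, not confirmed by this commit):

    def _get_pkuseg_trie_data(node, path=""):
        # Walk the user-dict trie depth-first and collect (word, tag) pairs.
        data = []
        for c, child_node in sorted(node.children.items()):
            if child_node.isword:
                data.append((path + c, child_node.usertag))
            data.extend(_get_pkuseg_trie_data(child_node, path + c))
        return data

Under that reading, the test's arithmetic follows directly: inserting "nonsense_asdf" grows the flattened dict by exactly one entry, and reset=True replaces the Preprocesser, so the flattened dict comes back empty.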