Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Add method to update / reset pkuseg user dict (#5404)
This commit is contained in:
parent 4a15b559ba
commit c963e269ba
spacy/lang/zh/__init__.py
@@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer):
         (words, spaces) = util.get_words_and_spaces(words, text)
         return Doc(self.vocab, words=words, spaces=spaces)
 
+    def pkuseg_update_user_dict(self, words, reset=False):
+        if self.pkuseg_seg:
+            if reset:
+                try:
+                    import pkuseg
+                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                except ImportError:
+                    if self.use_pkuseg:
+                        msg = (
+                            "pkuseg not installed: unable to reset pkuseg "
+                            "user dict. Please " + _PKUSEG_INSTALL_MSG
+                        )
+                        raise ImportError(msg)
+            for word in words:
+                self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+
     def _get_config(self):
         config = OrderedDict(
             (
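For context, a minimal usage sketch of the new method (not part of this diff). It assumes a spaCy v2.3-style tokenizer config with pkuseg and its "default" model installed; the config keys mirror the ChineseTokenizer options:

from spacy.lang.zh import Chinese

# Enable the pkuseg segmenter via the tokenizer config
# (assumes pkuseg and its "default" model are installed).
cfg = {"pkuseg_model": "default", "require_pkuseg": True}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})

# Add entries to the pkuseg user dictionary so they are kept as single tokens.
nlp.tokenizer.pkuseg_update_user_dict(["中文词汇"])

# Clear the user dictionary again; internally this swaps in a fresh
# pkuseg.Preprocesser(None).
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)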
spacy/tests/lang/zh/test_tokenizer.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import pytest
+from spacy.lang.zh import _get_pkuseg_trie_data
 
 
 # fmt: off
@@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
     assert tokens == expected_tokens
 
 
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+    user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
+    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(user_dict) == len(updated_user_dict) - 1
+
+    # reset user dict
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
+    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(reset_user_dict) == 0
+
+
 def test_extra_spaces(zh_tokenizer_char):
     # note: three spaces after "I"
     tokens = zh_tokenizer_char("I   like cheese.")
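Note: the _get_pkuseg_trie_data helper imported above lives in spacy.lang.zh and is not shown in this diff. As a rough sketch of what the test relies on, it walks the pkuseg Preprocesser trie and collects the words inserted into the user dictionary; the node attributes (children, isword, usertag) below are assumptions about pkuseg's trie structure, not part of this change:

def _get_pkuseg_trie_data(node, path=""):
    # Depth-first walk over the user-dict trie, accumulating (word, tag)
    # pairs for every node marked as a complete word.
    data = []
    for c, child_node in sorted(node.children.items()):
        data.extend(_get_pkuseg_trie_data(child_node, path + c))
    if node.isword:
        data.append((path, node.usertag))
    return data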