From c963e269bac9c41222d81abf82131b1937912325 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 11:21:46 +0200
Subject: [PATCH] Add method to update / reset pkuseg user dict (#5404)

---
 spacy/lang/zh/__init__.py             | 16 ++++++++++++++++
 spacy/tests/lang/zh/test_tokenizer.py | 13 +++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 701e696a4..ed0b3eb74 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -104,6 +104,22 @@ class ChineseTokenizer(DummyTokenizer):
         (words, spaces) = util.get_words_and_spaces(words, text)
         return Doc(self.vocab, words=words, spaces=spaces)
 
+    def pkuseg_update_user_dict(self, words, reset=False):
+        if self.pkuseg_seg:
+            if reset:
+                try:
+                    import pkuseg
+                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                except ImportError:
+                    if self.use_pkuseg:
+                        msg = (
+                            "pkuseg not installed: unable to reset pkuseg "
+                            "user dict. Please " + _PKUSEG_INSTALL_MSG
+                        )
+                        raise ImportError(msg)
+            for word in words:
+                self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+
     def _get_config(self):
         config = OrderedDict(
             (
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
index bff7b1ed1..035798aa1 100644
--- a/spacy/tests/lang/zh/test_tokenizer.py
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
+from spacy.lang.zh import _get_pkuseg_trie_data
 
 
 # fmt: off
@@ -39,6 +40,18 @@ def test_zh_tokenizer_pkuseg(zh_tokenizer_pkuseg, text, expected_tokens):
     assert tokens == expected_tokens
 
 
+def test_zh_tokenizer_pkuseg_user_dict(zh_tokenizer_pkuseg):
+    user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict(["nonsense_asdf"])
+    updated_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(user_dict) == len(updated_user_dict) - 1
+
+    # reset user dict
+    zh_tokenizer_pkuseg.pkuseg_update_user_dict([], reset=True)
+    reset_user_dict = _get_pkuseg_trie_data(zh_tokenizer_pkuseg.pkuseg_seg.preprocesser.trie)
+    assert len(reset_user_dict) == 0
+
+
 def test_extra_spaces(zh_tokenizer_char):
     # note: three spaces after "I"
     tokens = zh_tokenizer_char("I   like cheese.")
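
Usage note (not part of the applied diff): a minimal sketch of how the new method might be called from user code. The Chinese(meta=...) construction and the "pkuseg_model" / "require_pkuseg" config keys are assumptions based on spaCy v2.x conventions, not something this patch defines; only pkuseg_update_user_dict itself comes from the diff.

    from spacy.lang.zh import Chinese

    # Assumed v2.x-style config: build a Chinese tokenizer that segments
    # with pkuseg (requires the pkuseg package and a model to be installed).
    cfg = {"pkuseg_model": "default", "require_pkuseg": True}
    nlp = Chinese(meta={"tokenizer": {"config": cfg}})

    # Insert entries into the pkuseg user dictionary so they are kept
    # together as single tokens during segmentation.
    nlp.tokenizer.pkuseg_update_user_dict(["中文分词", "自然语言处理"])

    # reset=True swaps in a fresh, empty pkuseg.Preprocesser before any
    # new words are inserted, discarding all previous user-dict entries.
    nlp.tokenizer.pkuseg_update_user_dict([], reset=True)

Per the diff, the method is a no-op when pkuseg_seg is unset, and the reset branch raises ImportError only when pkuseg cannot be imported and use_pkuseg is set.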