Mirror of https://github.com/explosion/spaCy.git
	Update Vietnamese tokenizer (#8099)
* Adapt tokenization methods from `pyvi` to preserve text encoding and whitespace
* Add serialization support similar to Chinese and Japanese

Note: as with Chinese and Japanese, some settings are duplicated in `config.cfg` and `tokenizer/cfg`.
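A minimal usage sketch of the two modes (not part of the commit; it assumes spaCy with this change and the optional `pyvi` package installed — the `use_pyvi` override mirrors the config used in the new tests):

    from spacy.lang.vi import Vietnamese

    # Default mode: pyvi's word segmenter groups syllables into multi-syllable words.
    nlp = Vietnamese()
    doc = nlp("Đây là một văn bản bằng tiếng Việt")
    print([t.text for t in doc])

    # Whitespace-only fallback: override use_pyvi via the config.
    nlp_ws = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    print([t.text for t in nlp_ws("Đây là một văn bản bằng tiếng Việt")])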
This commit is contained in:
parent 946a4284be
commit 1d59fdbd39
licenses/3rd_party_licenses.txt
@@ -43,8 +43,8 @@ scikit-learn
 
 * Files: scorer.py
 
-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:
 
 New BSD License
 
@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
spacy/lang/vi/__init__.py
@@ -1,8 +1,15 @@
+from typing import Any, Dict, Union
+from pathlib import Path
+import re
+import srsly
+import string
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ... import util
 
 
 DEFAULT_CONFIG = """
@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):
 
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)
 
+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        return self
+
 
 class VietnameseDefaults(Language.Defaults):
     config = load_config_from_str(DEFAULT_CONFIG)
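As an aside (not part of the diff), a small sketch of how `spacy.util.get_words_and_spaces`, used in `__call__` above, re-aligns the segmenter's words with the original text so the resulting `Doc` reproduces it exactly; the example strings are illustrative:

    from spacy.util import get_words_and_spaces

    text = "Đây là  một văn bản"                # note the double space after "là"
    segments = ["Đây", "là", "một", "văn bản"]  # illustrative segmenter output
    words, spaces = get_words_and_spaces(segments, text)
    print(words)   # expected: ['Đây', 'là', ' ', 'một', 'văn bản']
    print(spaces)  # expected: [True, True, False, True, False]
    # Joining each word plus a trailing space where spaces[i] is True reproduces text.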
spacy/tests/conftest.py
@@ -286,6 +286,12 @@ def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo")().tokenizer


spacy/tests/lang/vi/__init__.py (new, empty file)


spacy/tests/lang/vi/test_serialize.py (new file)
@@ -0,0 +1,33 @@
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir


def test_vi_tokenizer_serialize(vi_tokenizer):
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.use_pyvi == False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi == False


spacy/tests/lang/vi/test_tokenizer.py (new file)
@@ -0,0 +1,47 @@
import pytest

from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
from spacy.lang.vi import Vietnamese


# fmt: off
TOKENIZER_TESTS = [
    ("Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn  bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
]
# fmt: on


@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
    tokens = [token.text for token in vi_tokenizer(text)]
    assert tokens == expected_tokens


def test_vi_tokenizer_extra_spaces(vi_tokenizer):
    # note: three spaces after "I"
    tokens = vi_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == "  "


@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
    tokens = vi_tokenizer(text)
    assert tokens.text_with_ws == text


def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
    doc = vi_tokenizer("")
    assert len(doc) == 0
    doc = vi_tokenizer(" ")
    assert len(doc) == 1
    doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
    assert len(doc) == 1


def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    assert [t.text for t in doc if not t.is_space] == text.split()
    assert doc[4].text == " "