mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
Passing Hungarian abbrev tests.

This commit is contained in:
parent 0289b8ceaa
commit 2051726fd3
@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
 
 HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
 
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-„
-“
-'
-``
-`
-#
-US$
-C$
-A$
-‘
-....
-...
-‚
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()
 
 TOKENIZER_SUFFIXES = r'''
 ,
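A minimal sketch (not part of this commit) of the behavioural difference in the prefix list above: the old form regex-escaped each newline-separated entry, while the new form is a single whitespace-separated list of plain strings (the diff also drops entries such as >, „, ‚, », _ and § and adds a-). The lists below are abridged for illustration:

import re

# Old style (abridged): one prefix per line, each escaped for literal use in a regex.
old_prefixes = [re.escape(p) for p in r'''
"
$
US$
....
'''.strip().split('\n')]

# New style (abridged): a single whitespace-separated list of plain strings.
new_prefixes = r'''" $ US$ .... ...'''.split()

print(old_prefixes[-1])   # -> \.\.\.\.  (dots are regex-escaped)
print(new_prefixes[-2:])  # -> ['....', '...']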
@@ -125,11 +98,11 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
 
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                      r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                      r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
 
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                  _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 
 TOKENIZER_EXCEPTIONS = {
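A minimal sketch (not part of this commit) of what the infix change above does: relaxing \.\.\.+ to \.\.+ means a run of two dots now counts as an infix, which is what the new Egy..ket. case in the dots fixture below exercises.

import re

old_infix = re.compile(r'\.\.\.+')  # three or more dots
new_infix = re.compile(r'\.\.+')    # two or more dots

print(bool(old_infix.search("Egy..ket")))   # False - ".." was not an infix before
print(bool(new_infix.search("Egy..ket")))   # True  - ".." now splits
print(bool(new_infix.search("Valami...")))  # True  - "..." still matches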
@@ -1,27 +0,0 @@
-import pytest
-
-from spacy.hu import Hungarian
-
-
-@pytest.fixture(scope="session")
-def HU():
-    return Hungarian()
-
-
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
-    return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input_str", "expected_length"), [
-    ("A vs. egy", 3),
-    ("A dr. egy", 3),
-    ("A .hu egy tld.", 5),
-    ("A .hu.", 3),
-    ("Az egy.ketto pelda.", 4),
-    ("A pl. rovidites.", 4),
-    ("A S.M.A.R.T. szo.", 4)
-])
-def test_abbreviations(hu_tokenizer, input_str, expected_length):
-    tokens = hu_tokenizer(input_str)
-    assert len(tokens) == expected_length
4  spacy/tests/hu/tokenizer/__init__.py  Normal file
@@ -0,0 +1,4 @@
+__author__ = 'gyorgyorosz'
+
+if __name__ == "__main__":
+    pass
58  spacy/tests/hu/tokenizer/test_default_token_dots.txt  Normal file
@@ -0,0 +1,58 @@
+# TOKEN dots
+
+0. egyszeru szavak
+IN : N. kormányzósági
+IN : székhely.
+OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
+OUT: </ws><w>székhely</w><c>.</c></s>
+
+
+1. szavak pontokkal
+
+1.1 mondatkozi verziok
+1.1.1 pottal kezdodo szavak
+IN : A .hu egy tld.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
+1.1.2 pont a szo belsejeben
+IN : Az egy.ketto pelda.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
+1.1.3 pont a szo vegen
+IN : A pl. rovidites.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
+1.1.4 pontozott szo
+IN : A S.M.A.R.T. szo.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
+
+1.2 mondatvegi verziok
+1.2.1 pottal kezdodo szavak
+IN : A .hu.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
+1.2.2 pont a szo belsejeben
+IN : Az egy.ketto.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
+1.2.3 pont a szo vegen
+#TODO: cf. Huntoken
+IN : A pl.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
+1.2.4 pontozott szo
+#TODO: cf. Huntoken
+IN : A S.M.A.R.T.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
+
+
+2. tobb pont
+
+2.1 ketto vagy tobb pont utan uj szo
+IN : Egy..ket.
+OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
+IN : Valami... van.
+OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
+IN : Valami ...van...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
+IN : Valami...
+OUT: <s><w>Valami</w><c>...</c></s>
+IN : Valami ...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
+IN : Valami ... más.
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
+
74  spacy/tests/hu/tokenizer/test_tokenizer.py  Normal file
@@ -0,0 +1,74 @@
+import os
+import re
+
+import pytest
+
+from spacy.hu import Hungarian
+
+_MODULE_PATH = os.path.dirname(__file__)
+
+
+class TokenizerTestCase(object):
+    INPUT_PREFIX = "IN :"
+    OUTPUT_PREFIX = "OUT:"
+    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
+
+    def __init__(self, input_str, expected_words):
+        self.input = input_str
+        self.expected_tokens = expected_words
+
+    def __repr__(self):
+        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)
+
+    def to_tuple(self):
+        return (self.input, self.expected_tokens)
+
+    @classmethod
+    def _parse_output_line(cls, line):
+        for match in cls.WORD_PATTERN.finditer(line):
+            yield match.group(2)
+
+    @classmethod
+    def read_from_file(cls, path):
+        with open(path) as f:
+            input_lines = []
+            output_words = []
+            last_type = None
+            for line in f:
+                if line.startswith(cls.INPUT_PREFIX):
+                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
+                    last_type = TokenizerTestCase.INPUT_PREFIX
+                elif line.startswith(cls.OUTPUT_PREFIX):
+                    output_words.extend(list(cls._parse_output_line(line.strip())))
+                    last_type = TokenizerTestCase.OUTPUT_PREFIX
+                else:
+                    # Comments separate test cases
+                    if input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    last_type = None
+
+
+_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+
+
+@pytest.fixture(scope="session")
+def HU():
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):
+    return HU.tokenizer
+
+
+@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+def test_abbreviations(hu_tokenizer, test_case):
+    tokens = hu_tokenizer(test_case.input)
+    token_list = [token.orth_ for token in tokens if not token.is_space]
+    assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
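A minimal sketch (not part of this commit) of how the fixture markup feeds the new test above: WORD_PATTERN in TokenizerTestCase pulls the expected tokens out of the <w>/<c> spans of an OUT line, while <ws> spans never match, which is why the test also filters tokens with token.is_space.

import re

# Same pattern as TokenizerTestCase.WORD_PATTERN in the new test module.
WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

out_line = "OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>"
expected = [m.group(2) for m in WORD_PATTERN.finditer(out_line)]
print(expected)  # -> ['A', 'pl.', 'rovidites', '.']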