Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00
Passing Hungarian abbrev tests.
This commit is contained in:
parent 0289b8ceaa
commit 2051726fd3
@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
 
 HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
 
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-„
-“
-'
-``
-`
-#
-US$
-C$
-A$
-‘
-....
-...
-‚
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()
 
 TOKENIZER_SUFFIXES = r'''
 ,
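The prefix table is collapsed from the per-line form, which escaped every entry with re.escape at definition time, into a single whitespace-separated string (gaining the "a-" entry along the way). A minimal sketch of what the new one-liner evaluates to:

TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()

# A plain list of 21 prefix strings; multi-character prefixes such as "US$"
# and "a-" survive as single items, and no escaping is applied at this point.
print(len(TOKENIZER_PREFIXES))    # 21
print(TOKENIZER_PREFIXES[14:18])  # ['US$', 'C$', 'A$', 'a-']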
@@ -125,11 +98,11 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
 
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                      r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                      r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
 
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                 _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 
 TOKENIZER_EXCEPTIONS = {
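Two functional changes sit in this hunk: the ellipsis infix now matches two dots instead of requiring three, and the abbreviation entries are keyed on "ORTH" rather than "F". A minimal standalone check of the infix difference with plain re (only an illustration; spaCy compiles these patterns through its own tokenizer machinery):

import re

old_infix = re.compile(r'\.\.\.+')   # removed pattern: needs three or more dots
new_infix = re.compile(r'\.\.+')     # added pattern: two dots already count

text = "Egy..ket."                   # taken from the new dot test cases below
print(bool(old_infix.search(text)))  # False -> ".." would never trigger an infix split
print(bool(new_infix.search(text)))  # True  -> the ".." between "Egy" and "ket" is found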
@@ -1,27 +0,0 @@
-import pytest
-
-from spacy.hu import Hungarian
-
-
-@pytest.fixture(scope="session")
-def HU():
-    return Hungarian()
-
-
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
-    return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input_str", "expected_length"), [
-    ("A vs. egy", 3),
-    ("A dr. egy", 3),
-    ("A .hu egy tld.", 5),
-    ("A .hu.", 3),
-    ("Az egy.ketto pelda.", 4),
-    ("A pl. rovidites.", 4),
-    ("A S.M.A.R.T. szo.", 4)
-])
-def test_abbreviations(hu_tokenizer, input_str, expected_length):
-    tokens = hu_tokenizer(input_str)
-    assert len(tokens) == expected_length
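The deleted suite only asserted token counts; the replacement below compares the exact token strings against expected-output files. A small illustration of the difference, assuming a hu_tokenizer fixture like the one defined in these tests:

def test_count_only(hu_tokenizer):
    # old style: a wrong split such as "A", ".", "hu." would still pass,
    # because it also yields three tokens
    tokens = hu_tokenizer("A .hu.")
    assert len(tokens) == 3


def test_exact_tokens(hu_tokenizer):
    # new style: the leading-dot word has to come out exactly as ".hu"
    tokens = hu_tokenizer("A .hu.")
    assert [t.orth_ for t in tokens if not t.is_space] == ["A", ".hu", "."]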
spacy/tests/hu/tokenizer/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
+__author__ = 'gyorgyorosz'
+
+if __name__ == "__main__":
+    pass
spacy/tests/hu/tokenizer/test_default_token_dots.txt (new file, 58 lines)
@@ -0,0 +1,58 @@
+# TOKEN dots
+
+0. egyszeru szavak
+IN : N. kormányzósági
+IN : székhely.
+OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
+OUT: </ws><w>székhely</w><c>.</c></s>
+
+
+1. szavak pontokkal
+
+1.1 mondatkozi verziok
+1.1.1 pottal kezdodo szavak
+IN : A .hu egy tld.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
+1.1.2 pont a szo belsejeben
+IN : Az egy.ketto pelda.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
+1.1.3 pont a szo vegen
+IN : A pl. rovidites.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
+1.1.4 pontozott szo
+IN : A S.M.A.R.T. szo.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
+
+1.2 mondatvegi verziok
+1.2.1 pottal kezdodo szavak
+IN : A .hu.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
+1.2.2 pont a szo belsejeben
+IN : Az egy.ketto.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
+1.2.3 pont a szo vegen
+#TODO: cf. Huntoken
+IN : A pl.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
+1.2.4 pontozott szo
+#TODO: cf. Huntoken
+IN : A S.M.A.R.T.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
+
+
+2. tobb pont
+
+2.1 ketto vagy tobb pont utan uj szo
+IN : Egy..ket.
+OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
+IN : Valami... van.
+OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
+IN : Valami ...van...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
+IN : Valami...
+OUT: <s><w>Valami</w><c>...</c></s>
+IN : Valami ...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
+IN : Valami ... más.
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
+
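In these files, <w> marks word tokens, <c> punctuation-like tokens, and <ws> the whitespace between them; the driver below keeps only the <w> and <c> contents as the expected token list. A minimal sketch of that extraction, using the same regex as TokenizerTestCase.WORD_PATTERN:

import re

WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

out_line = "OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>"
expected = [m.group(2) for m in WORD_PATTERN.finditer(out_line)]
print(expected)  # ['A', '.hu', 'egy', 'tld', '.']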
spacy/tests/hu/tokenizer/test_tokenizer.py (new file, 74 lines)
@@ -0,0 +1,74 @@
+import os
+import re
+
+import pytest
+
+from spacy.hu import Hungarian
+
+_MODULE_PATH = os.path.dirname(__file__)
+
+
+class TokenizerTestCase(object):
+    INPUT_PREFIX = "IN :"
+    OUTPUT_PREFIX = "OUT:"
+    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
+
+    def __init__(self, input_str, expected_words):
+        self.input = input_str
+        self.expected_tokens = expected_words
+
+    def __repr__(self):
+        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)
+
+    def to_tuple(self):
+        return (self.input, self.expected_tokens)
+
+    @classmethod
+    def _parse_output_line(cls, line):
+        for match in cls.WORD_PATTERN.finditer(line):
+            yield match.group(2)
+
+    @classmethod
+    def read_from_file(cls, path):
+        with open(path) as f:
+            input_lines = []
+            output_words = []
+            last_type = None
+            for line in f:
+                if line.startswith(cls.INPUT_PREFIX):
+                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
+                    last_type = TokenizerTestCase.INPUT_PREFIX
+                elif line.startswith(cls.OUTPUT_PREFIX):
+                    output_words.extend(list(cls._parse_output_line(line.strip())))
+                    last_type = TokenizerTestCase.OUTPUT_PREFIX
+                else:
+                    # Comments separate test cases
+                    if input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    last_type = None
+
+
+_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+
+
+@pytest.fixture(scope="session")
+def HU():
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):
+    return HU.tokenizer
+
+
+@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+def test_abbreviations(hu_tokenizer, test_case):
+    tokens = hu_tokenizer(test_case.input)
+    token_list = [token.orth_ for token in tokens if not token.is_space]
+    assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
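For a quick manual check outside pytest, something along these lines should reproduce one of the simpler dot cases (assuming a checkout where spacy.hu.Hungarian loads the language data touched above):

from spacy.hu import Hungarian

nlp = Hungarian()
tokens = nlp.tokenizer("A pl. rovidites.")
# the abbreviation "pl." should survive as a single token
print([t.orth_ for t in tokens if not t.is_space])
# expected, per the test data: ['A', 'pl.', 'rovidites', '.']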