Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
Auto-format

commit 6e303de717 (parent 235fe6fe3b)
@@ -206,7 +206,15 @@ _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian
 _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
 
 _uncased = (
-    _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu + _hangul
+    _bengali
+    + _hebrew
+    + _persian
+    + _sinhala
+    + _hindi
+    + _kannada
+    + _tamil
+    + _telugu
+    + _hangul
 )
 
 ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
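Note: these character-class strings are later joined and compiled into regex character classes (e.g. via ALPHA). A minimal sketch of the idea, using two illustrative script ranges rather than spaCy's full Unicode blocks; the variable contents below are illustrative, not spaCy's internals:

    import re

    # Illustrative ranges only; the real module covers many more blocks.
    _hangul = "\uac00-\ud7af"  # Hangul syllables
    _hebrew = "\u0590-\u05ff"  # Hebrew block

    # Uncased scripts have no upper/lower distinction, so they feed
    # directly into the combined alphabetic class.
    _uncased = _hangul + _hebrew
    ALPHA = _uncased  # plus the cased Latin/Cyrillic/... ranges in spaCy

    alpha_re = re.compile("[{}]+".format(ALPHA))
    print(alpha_re.findall("한국어 text עברית"))  # ['한국어', 'עברית']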
@@ -5,47 +5,47 @@ from ...attrs import LIKE_NUM
 
 
 _num_words = [
-  "영",
-  "공",
-  # Native Korean number system
-  "하나",
-  "둘",
-  "셋",
-  "넷",
-  "다섯",
-  "여섯",
-  "일곱",
-  "여덟",
-  "아홉",
-  "열",
-  "스물",
-  "서른",
-  "마흔",
-  "쉰",
-  "예순",
-  "일흔",
-  "여든",
-  "아흔",
-  # Sino-Korean number system
-  "일",
-  "이",
-  "삼",
-  "사",
-  "오",
-  "육",
-  "칠",
-  "팔",
-  "구",
-  "십",
-  "백",
-  "천",
-  "만",
-  "십만",
-  "백만",
-  "천만",
-  "일억",
-  "십억",
-  "백억"
+    "영",
+    "공",
+    # Native Korean number system
+    "하나",
+    "둘",
+    "셋",
+    "넷",
+    "다섯",
+    "여섯",
+    "일곱",
+    "여덟",
+    "아홉",
+    "열",
+    "스물",
+    "서른",
+    "마흔",
+    "쉰",
+    "예순",
+    "일흔",
+    "여든",
+    "아흔",
+    # Sino-Korean number system
+    "일",
+    "이",
+    "삼",
+    "사",
+    "오",
+    "육",
+    "칠",
+    "팔",
+    "구",
+    "십",
+    "백",
+    "천",
+    "만",
+    "십만",
+    "백만",
+    "천만",
+    "일억",
+    "십억",
+    "백억",
 ]
 
 
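Note: _num_words feeds the LIKE_NUM lexical attribute imported above. A sketch of the usual spaCy lex_attrs pattern, with an abbreviated word list; the Korean module's exact function may differ:

    from spacy.attrs import LIKE_NUM

    _num_words = ["영", "공", "하나", "둘", "일", "이", "삼", "십", "백", "천"]

    def like_num(text):
        # Plain digits (commas/periods stripped) or a listed number word.
        text = text.replace(",", "").replace(".", "")
        if text.isdigit():
            return True
        return text in _num_words

    LEX_ATTRS = {LIKE_NUM: like_num}

    print(like_num("1,000"), like_num("삼"), like_num("fox"))  # True True False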
@@ -6,9 +6,7 @@ from ...symbols import ORTH, LEMMA, NORM
 # TODO
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
 
-_exc = {
-
-}
+_exc = {}
 
 # translate / delete what is not necessary
 for exc_data in [
@@ -14,6 +14,7 @@ from .tag_map import TAG_MAP
 def try_jieba_import(use_jieba):
     try:
         import jieba
+
         return jieba
     except ImportError:
         if use_jieba:
@@ -34,7 +35,9 @@ class ChineseTokenizer(DummyTokenizer):
     def __call__(self, text):
         # use jieba
         if self.use_jieba:
-            jieba_words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
+            jieba_words = list(
+                [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
+            )
             words = [jieba_words[0]]
             spaces = [False]
             for i in range(1, len(jieba_words)):
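Note: the reformatted call is jieba's standard segmentation API. A standalone sketch of what the tokenizer consumes (assumes jieba is installed; whitespace handling is simplified here):

    import jieba

    text = "我爱自然语言处理"
    # Same filtering as ChineseTokenizer.__call__: drop empty segments.
    words = [w for w in jieba.cut(text, cut_all=False) if w]
    spaces = [False] * len(words)  # no inter-token spaces in Chinese text
    print(words, spaces)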
@@ -271,7 +271,9 @@ class Scorer(object):
                         self.labelled_per_dep[token.dep_.lower()] = PRFScore()
                     if token.dep_.lower() not in cand_deps_per_dep:
                         cand_deps_per_dep[token.dep_.lower()] = set()
-                    cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
+                    cand_deps_per_dep[token.dep_.lower()].add(
+                        (gold_i, gold_head, token.dep_.lower())
+                    )
         if "-" not in [token[-1] for token in gold.orig_annot]:
             # Find all NER labels in gold and doc
             ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@@ -304,7 +306,9 @@ class Scorer(object):
         self.tags.score_set(cand_tags, gold_tags)
         self.labelled.score_set(cand_deps, gold_deps)
         for dep in self.labelled_per_dep:
-            self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
+            self.labelled_per_dep[dep].score_set(
+                cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
+            )
         self.unlabelled.score_set(
             set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
         )
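Note: score_set does set-based precision/recall/F bookkeeping per dependency label. A sketch consistent with how spaCy's PRFScore behaves; the exact smoothing constant is an assumption:

    class PRFScoreSketch(object):
        """Set-based precision/recall/F counters, one per dep label."""

        def __init__(self):
            self.tp = 0.0  # candidate items also present in gold
            self.fp = 0.0  # candidate items absent from gold
            self.fn = 0.0  # gold items the candidate missed

        def score_set(self, cand, gold):
            self.tp += len(cand & gold)
            self.fp += len(cand - gold)
            self.fn += len(gold - cand)

        @property
        def precision(self):
            return self.tp / (self.tp + self.fp + 1e-100)

        @property
        def recall(self):
            return self.tp / (self.tp + self.fn + 1e-100)

        @property
        def fscore(self):
            p, r = self.precision, self.recall
            return 2 * p * r / (p + r + 1e-100)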
@@ -11,8 +11,14 @@ def test_issue4590(en_vocab):
     """Test that matches param in on_match method are the same as matches run with no on_match method"""
     pattern = [
         {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
-        {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
-        {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
     ]
 
     on_match = Mock()
@@ -23,12 +29,11 @@ def test_issue4590(en_vocab):
     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
     deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
-    
+
     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
-    
+
     matches = matcher(doc)
-    
+
     on_match_args = on_match.call_args
 
     assert on_match_args[0][3] == matches
-
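Note: the final assertion pins down the on_match callback signature, (matcher, doc, i, matches), where matches is the same list matcher(doc) returns. A minimal callback sketch:

    def on_match(matcher, doc, i, matches):
        # Called once per match with the full results list; the test
        # asserts this list equals what matcher(doc) returns.
        print("match", i, "of", len(matches), ":", matches[i])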
@@ -12,8 +12,22 @@ from .util import get_doc
 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",
-        {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
-         "deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
+        {
+            "heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
+            "deps": [
+                "nsubj",
+                "aux",
+                "ROOT",
+                "prep",
+                "pcomp",
+                "compound",
+                "dobj",
+                "prep",
+                "quantmod",
+                "compound",
+                "pobj",
+            ],
+        },
     ]
 ]
 
@@ -59,7 +73,7 @@ def test_las_per_type(en_vocab):
             en_vocab,
             words=input_.split(" "),
             heads=([h - i for i, h in enumerate(annot["heads"])]),
-            deps=annot["deps"]
+            deps=annot["deps"],
         )
         gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
         doc[0].dep_ = "compound"
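Note: the heads expression converts the test data's absolute head indices into the relative offsets the get_doc helper expects. A quick check with the heads from test_las_apple:

    heads = [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7]  # absolute, from the test data
    relative = [h - i for i, h in enumerate(heads)]
    print(relative)  # [2, 1, 0, -1, -1, 1, -2, -3, 2, 1, -3]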