Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-28 06:31:12 +03:00)
Tidy up merge conflict leftovers

commit ae880ef912
parent 61d09c481b
@@ -1,11 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import re
+
 from ...gold import iob_to_biluo
 from ...util import minibatch
 
-import re
-
 
 def iob2json(input_data, n_sents=10, *args, **kwargs):
     """
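Context for this hunk: `iob_to_biluo` converts IOB tag sequences into the BILUO scheme spaCy uses internally, and `minibatch` is used to group converted sentences per the `n_sents` parameter. A minimal usage sketch of the converter, assuming the spaCy 2.x `spacy.gold` module these relative imports resolve to; the sample tags and expected output are illustrative:

    from spacy.gold import iob_to_biluo

    tags = ["O", "B-PER", "I-PER", "O"]
    # the last token of a multi-token entity becomes L-*; single tokens become U-*
    print(iob_to_biluo(tags))  # ["O", "B-PER", "L-PER", "O"]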
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-# stop words from HAZM package
 
 # Stop words from HAZM package
 STOP_WORDS = set(
@@ -1,10 +1,3 @@
-"""
-Slang and abbreviations
-
-Daftar kosakata yang sering salah dieja
-https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
-
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -1,6 +1,3 @@
-"""
-List of stop words in Bahasa Indonesia.
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -1,7 +1,3 @@
-"""
-Daftar singkatan dan Akronim dari:
-https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -291,8 +291,6 @@ cdef char get_quantifier(PatternStateC state) nogil:
 
 DEF PADDING = 5
 
-DEF PADDING = 5
-
 
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
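The duplicate removed here is a Cython `DEF` directive: it binds a compile-time constant that the compiler substitutes wherever the name appears, so the merge had left two identical, redundant definitions. A minimal sketch of the mechanism, with an illustrative function name not taken from the spaCy source:

    # in a .pyx file: DEF constants are substituted at compile time, like a C macro
    DEF PADDING = 5

    cdef int padded_length(int n):
        # compiles as `return n + 5`
        return n + PADDING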
@@ -53,24 +53,7 @@ def test_spans_merge_heads(en_tokenizer):
 
 def test_spans_merge_non_disjoint(en_tokenizer):
     text = "Los Angeles start."
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    with pytest.raises(ValueError):
-        with doc.retokenize() as retokenizer:
-            retokenizer.merge(
-                doc[0:2],
-                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
-            )
-            retokenizer.merge(
-                doc[0:1],
-                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
-            )
-
-
-def test_spans_merge_non_disjoint(en_tokenizer):
-    text = "Los Angeles start."
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens])
+    doc = en_tokenizer(text)
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(
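After this hunk a single copy of the test remains. Pieced together from the surviving context lines and the removed duplicate, the deduplicated test plausibly reads as follows (a reconstruction, since the diff view is truncated at the first `retokenizer.merge(` of the kept copy):

    def test_spans_merge_non_disjoint(en_tokenizer):
        text = "Los Angeles start."
        doc = en_tokenizer(text)
        with pytest.raises(ValueError):
            with doc.retokenize() as retokenizer:
                # the two spans overlap, so the retokenizer raises
                retokenizer.merge(
                    doc[0:2],
                    attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
                )
                retokenizer.merge(
                    doc[0:1],
                    attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
                )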
@@ -1,36 +0,0 @@
-'''Test issue that arises when too many labels are added to NER model.'''
-from __future__ import unicode_literals
-
-import random
-from ...lang.en import English
-
-def train_model(train_data, entity_types):
-    nlp = English(pipeline=[])
-
-    ner = nlp.create_pipe("ner")
-    nlp.add_pipe(ner)
-
-    for entity_type in list(entity_types):
-        ner.add_label(entity_type)
-
-    optimizer = nlp.begin_training()
-
-    # Start training
-    for i in range(20):
-        losses = {}
-        index = 0
-        random.shuffle(train_data)
-
-        for statement, entities in train_data:
-            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
-    return nlp
-
-
-def test_train_with_many_entity_types():
-    train_data = []
-    train_data.extend([("One sentence", {"entities": []})])
-    entity_types = [str(i) for i in range(1000)]
-
-    model = train_model(train_data, entity_types)
-
-
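The file deleted above was a regression test that added roughly 1000 labels to a fresh NER model and trained briefly. For reference, a self-contained sketch of the same check against the spaCy 2.x API; dropping the unused `index` variable and the `pipeline=[]` argument are my cleanups, not the original code:

    import random
    from spacy.lang.en import English

    def test_train_with_many_entity_types():
        train_data = [("One sentence", {"entities": []})]
        nlp = English()
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
        for i in range(1000):
            ner.add_label(str(i))
        optimizer = nlp.begin_training()
        for _ in range(20):
            losses = {}
            random.shuffle(train_data)
            for statement, entities in train_data:
                nlp.update([statement], [entities], sgd=optimizer,
                           losses=losses, drop=0.5)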
@@ -1,40 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-import os
-from pathlib import Path
-
-from ..compat import symlink_to, symlink_remove, path2str
-
-
-def target_local_path():
-    return "./foo-target"
-
-
-def link_local_path():
-    return "./foo-symlink"
-
-
-@pytest.fixture(scope="function")
-def setup_target(request):
-    target = Path(target_local_path())
-    if not target.exists():
-        os.mkdir(path2str(target))
-
-    # yield -- need to cleanup even if assertion fails
-    # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
-    def cleanup():
-        symlink_remove(Path(link_local_path()))
-        os.rmdir(target_local_path())
-
-    request.addfinalizer(cleanup)
-
-
-def test_create_symlink_windows(setup_target):
-    target = Path(target_local_path())
-    link = Path(link_local_path())
-    assert target.exists()
-
-    symlink_to(link, target)
-    assert link.exists()
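This second deleted file tested the `symlink_to`/`symlink_remove` helpers from `spacy.compat`, which exist because symlink creation is platform-specific: on Windows it may require elevated privileges or Developer Mode. A rough standard-library sketch of the pattern such helpers follow; this illustrates the idea and is not the actual spaCy implementation:

    import os
    from pathlib import Path

    def make_symlink(link, target):
        # creates `link` pointing at `target`
        Path(link).symlink_to(Path(target), target_is_directory=Path(target).is_dir())

    def remove_symlink(link):
        # unlink removes the symlink itself, never its target
        os.unlink(str(link))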