Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher

Commit cf7047bbdf
@@ -24,7 +24,7 @@ class UkrainianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
 
     @classmethod
-    def create_lemmatizer(cls, nlp=None):
+    def create_lemmatizer(cls, nlp=None, **kwargs):
         return UkrainianLemmatizer()
 
 
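The hunk above widens the create_lemmatizer signature with **kwargs. A minimal sketch of the effect, with stub classes standing in for the real spaCy ones (the lookups keyword below is only an illustrative assumption, not taken from this diff):

class UkrainianLemmatizer:
    """Stub standing in for the real UkrainianLemmatizer."""

class UkrainianDefaults:
    @classmethod
    def create_lemmatizer(cls, nlp=None, **kwargs):
        # Extra keyword arguments are accepted and ignored, so callers that
        # pass additional options no longer raise a TypeError.
        return UkrainianLemmatizer()

# Before the change, a call with an extra keyword would fail with:
#   TypeError: create_lemmatizer() got an unexpected keyword argument 'lookups'
lemmatizer = UkrainianDefaults.create_lemmatizer(nlp=None, lookups=None)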
@@ -41,8 +41,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     parser.model, _ = parser.Model(10)
     new_parser = Parser(en_vocab)
     new_parser.model, _ = new_parser.Model(10)
-    new_parser = new_parser.from_bytes(parser.to_bytes())
-    assert new_parser.to_bytes() == parser.to_bytes()
+    new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
+    assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"])
 
 
 @pytest.mark.parametrize("Parser", test_parsers)
@@ -55,8 +55,8 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
         parser_d = Parser(en_vocab)
         parser_d.model, _ = parser_d.Model(0)
         parser_d = parser_d.from_disk(file_path)
-        parser_bytes = parser.to_bytes(exclude=["model"])
-        parser_d_bytes = parser_d.to_bytes(exclude=["model"])
+        parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
+        parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
         assert parser_bytes == parser_d_bytes
 
 
@@ -64,7 +64,7 @@ def test_to_from_bytes(parser, blank_parser):
     assert parser.model is not True
     assert blank_parser.model is True
     assert blank_parser.moves.n_moves != parser.moves.n_moves
-    bytes_data = parser.to_bytes()
+    bytes_data = parser.to_bytes(exclude=["vocab"])
     blank_parser.from_bytes(bytes_data)
     assert blank_parser.model is not True
     assert blank_parser.moves.n_moves == parser.moves.n_moves
@@ -94,15 +94,12 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
         assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
 
 
-# I can't get this to work with the lookup tables for 3.5 :(. Something to do
-# with the dict ordering
-@pytest.mark.xfail
 def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
     tensorizer = Tensorizer(en_vocab)
     tensorizer.model = tensorizer.Model()
-    tensorizer_b = tensorizer.to_bytes()
+    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
     new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b)
-    assert new_tensorizer.to_bytes() == tensorizer_b
+    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b
 
 
 def test_serialize_tensorizer_roundtrip_disk(en_vocab):
@@ -112,16 +109,15 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
         file_path = d / "tensorizer"
         tensorizer.to_disk(file_path)
         tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
-        assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
+        assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
+            exclude=["vocab"]
+        )
 
 
-# I can't get this to work with the lookup tables for 3.5 :(. Something to do
-# with the dict ordering
-@pytest.mark.xfail
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
-    textcat.to_bytes()
+    textcat.to_bytes(exclude=["vocab"])
 
 
 @pytest.mark.parametrize("Parser", test_parsers)
@@ -134,13 +130,17 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
     parser = Parser(en_vocab)
     parser.model, _ = parser.Model(0)
     parser.cfg["foo"] = "bar"
-    new_parser = get_new_parser().from_bytes(parser.to_bytes())
+    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
     assert "foo" in new_parser.cfg
-    new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
+    new_parser = get_new_parser().from_bytes(
+        parser.to_bytes(exclude=["vocab"]), exclude=["cfg"]
+    )
     assert "foo" not in new_parser.cfg
-    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
+    new_parser = get_new_parser().from_bytes(
+        parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
+    )
     assert "foo" not in new_parser.cfg
     with pytest.raises(ValueError):
-        parser.to_bytes(cfg=False)
+        parser.to_bytes(cfg=False, exclude=["vocab"])
     with pytest.raises(ValueError):
-        get_new_parser().from_bytes(parser.to_bytes(), cfg=False)
+        get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
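Most of the test changes above follow one pattern: serialize pipes with exclude=["vocab"] so that byte-for-byte roundtrip comparisons no longer depend on the shared vocab. A self-contained toy sketch of that pattern (not spaCy's actual implementation; TinyPipe and its fields are invented purely for illustration):

import json

class TinyPipe:
    # Toy pipe with to_bytes/from_bytes supporting an exclude list,
    # mimicking the shape of the calls in the tests above.
    def __init__(self, vocab, cfg=None):
        self.vocab = vocab  # shared resource, kept out of roundtrip payloads
        self.cfg = cfg if cfg is not None else {}

    def to_bytes(self, exclude=tuple()):
        data = {"cfg": self.cfg, "vocab": self.vocab}
        for key in exclude:
            data.pop(key, None)
        return json.dumps(data, sort_keys=True).encode("utf8")

    def from_bytes(self, bytes_data, exclude=tuple()):
        data = json.loads(bytes_data)
        for key in ("cfg", "vocab"):
            if key in data and key not in exclude:
                setattr(self, key, data[key])
        return self

pipe = TinyPipe(vocab={"apple": 1}, cfg={"foo": "bar"})
new_pipe = TinyPipe(vocab={}).from_bytes(pipe.to_bytes(exclude=["vocab"]))
# The comparison is stable because the shared vocab never enters the payload:
assert new_pipe.to_bytes(exclude=["vocab"]) == pipe.to_bytes(exclude=["vocab"])
assert new_pipe.cfg["foo"] == "bar"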