Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
* Update with WIP
* Update with WIP
* Update with pipeline serialization
* Update types and pipe factories
* Add deep merge, tidy up and add tests
* Fix pipe creation from config
* Don't validate default configs on load
* Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io>
* Adjust factory/component meta error
* Clean up factory args and remove defaults
* Add test for failing empty dict defaults
* Update pipeline handling and methods
* provide KB as registry function instead of as object
* small change in test to make functionality more clear
* update example script for EL configuration
* Fix typo
* Simplify test
* Simplify test
* splitting pipes.pyx into separate files
* moving default configs to each component file
* fix batch_size type
* removing default values from component constructors where possible (TODO: test 4725)
* skip instead of xfail
* Add test for config -> nlp with multiple instances
* pipeline.pipes -> pipeline.pipe
* Tidy up, document, remove kwargs
* small cleanup/generalization for Tok2VecListener
* use DEFAULT_UPSTREAM field
* revert to avoid circular imports
* Fix tests
* Replace deprecated arg
* Make model dirs require config
* fix pickling of keyword-only arguments in constructor
* WIP: clean up and integrate full config
* Add helper to handle function args more reliably. Now also includes keyword-only args
* Fix config composition and serialization
* Improve config debugging and add visual diff
* Remove unused defaults and fix type
* Remove pipeline and factories from meta
* Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/default_config.cfg
* small UX edits
* avoid printing stack trace for debug CLI commands
* Add support for language-specific factories
* specify the section of the config which holds the model to debug
* WIP: add Language.from_config
* Update with language data refactor WIP
* Auto-format
* Add backwards-compat handling for Language.factories
* Update morphologizer.pyx
* Fix morphologizer
* Update and simplify lemmatizers
* Fix Japanese tests
* Port over tagger changes
* Fix Chinese and tests
* Update to latest Thinc
* WIP: xfail first Russian lemmatizer test
* Fix component-specific overrides
* fix nO for output layers in debug_model
* Fix default value
* Fix tests and don't pass objects in config
* Fix deep merging
* Fix lemma lookup data registry. Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)
* Add types
* Add Vocab.from_config
* Fix typo
* Fix tests
* Make config copying more elegant
* Fix pipe analysis
* Fix lemmatizers and is_base_form
* WIP: move language defaults to config
* Fix morphology type
* Fix vocab
* Remove comment
* Update to latest Thinc
* Add morph rules to config
* Tidy up
* Remove set_morphology option from tagger factory
* Hack use_gpu
* Move [pipeline] to top-level block and make [nlp.pipeline] list. Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them
* Fix use_gpu and resume in CLI
* Auto-format
* Remove resume from config
* Fix formatting and error
* [pipeline] -> [components]
* Fix types
* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
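The log above tracks the move toward a single pipeline config in which each component is defined in its own [components] block and the pipeline order is just a list on the nlp block. As a rough illustration of the factory-based construction this enables, here is a minimal sketch in the style of the released spaCy v3 API; it is not part of the test file shown on this page, and the component names used are only examples.

# Minimal sketch (spaCy v3-style API, not part of this file): components are
# built from registered factories by name, so pipeline order stays independent
# of where and how each component block is defined.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("tagger")   # instantiate the registered "tagger" factory
nlp.add_pipe("parser")   # settings could equally come from a [components.parser] block
print(nlp.pipe_names)    # ["tagger", "parser"]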
		
			
				
	
	
		
180 lines · 5.5 KiB · Python
	
	
	
	
			
		
		
	
	
import pytest
import re
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB


def test_issue1061():
    """Test special-case works after tokenizing. Was caching problem."""
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    tokenizer = English().tokenizer
    doc = tokenizer(text)
    assert "MATH" in [w.text for w in doc]
    assert "_MATH_" not in [w.text for w in doc]

    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]

    # For sanity, check it works when pipeline is clean.
    tokenizer = English().tokenizer
    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    doc = tokenizer(text)
    assert "_MATH_" in [w.text for w in doc]
    assert "MATH" not in [w.text for w in doc]


@pytest.mark.skip(
    reason="Can not be fixed without variable-width look-behind (which we don't want)"
)
def test_issue1235():
    """Test that g is not split off if preceded by a number and a letter."""
    nlp = English()
    testwords = "e2g 2g 52g"
    doc = nlp(testwords)
    assert len(doc) == 5
    assert doc[0].text == "e2g"
    assert doc[1].text == "2"
    assert doc[2].text == "g"
    assert doc[3].text == "52"
    assert doc[4].text == "g"


def test_issue1242():
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1


def test_issue1250():
    """Test cached special cases."""
    special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
    nlp = English()
    nlp.tokenizer.add_special_case("reimbur", special_case)
    lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
    assert lemmas == ["reimburse", ",", "reimburse", "..."]
    lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
    assert lemmas == ["reimburse", ",", "reimburse", "..."]


def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=["a", "b", "c"])
    doc2 = Doc(Vocab(), words=["a", "c", "e"])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]


def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=["0", "1", "2"])
    with pytest.raises(IndexError):
        assert doc[0].nbor(-1)
    assert doc[1].nbor(-1).text == "0"
    with pytest.raises(IndexError):
        assert doc[2].nbor(1)
    assert doc[1].nbor(1).text == "2"


def test_issue1387():
    tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}}
    lookups = Lookups()
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lemmatizer = Lemmatizer(lookups)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"


def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    hello_world = Doc(vocab, words=["Hello", "World"])
    hello = Doc(vocab, words=["Hello"])
    matcher = Matcher(vocab)
    matcher.add("MyMatcher", [pattern])
    matches = matcher(hello_world)
    assert matches
    matches = matcher(hello)
    assert matches


@pytest.mark.parametrize(
    "string,start,end",
    [
        ("a", 0, 1),
        ("a b", 0, 2),
        ("a c", 0, 1),
        ("a b c", 0, 2),
        ("a b b c", 0, 3),
        ("a b b", 0, 3),
    ],
)
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator."""
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", [pattern])
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
    assert matches[-1][1] == start
    assert matches[-1][2] == end


def test_issue1488():
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text


def test_issue1494():
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected