mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 09:57:26 +03:00

Merge pull request #8397 from adrianeboyd/chore/develop-into-master-v3.1
Merge develop into master for v3.1
This commit is contained in: commit b09be3e1cb
.github/azure-steps.yml (10 changes, vendored)
@@ -60,8 +60,8 @@ steps:
     displayName: "Run GPU tests"
     condition: eq(${{ parameters.gpu }}, true)

-  - script: |
-      python -m spacy download en_core_web_sm
-      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+#  - script: |
+#      python -m spacy download en_core_web_sm
+#      python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')"
+#    displayName: 'Test download CLI'
+#    condition: eq(variables['python_version'], '3.8')
@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp
@@ -43,8 +43,8 @@ scikit-learn

 * Files: scorer.py

-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:

 New BSD License

@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -68,7 +68,7 @@ console_scripts =

 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.1,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.1.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
@@ -112,7 +112,9 @@ def package(
         msg.fail("Invalid pipeline meta.json")
         print("\n".join(errors))
         sys.exit(1)
-    model_name = meta["lang"] + "_" + meta["name"]
+    model_name = meta["name"]
+    if not model_name.startswith(meta["lang"] + "_"):
+        model_name = f"{meta['lang']}_{model_name}"
     model_name_v = model_name + "-" + meta["version"]
     main_path = output_dir / model_name_v
     package_path = main_path / model_name
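The naming change means spacy package now accepts meta.json files whose "name" already carries the language prefix. A minimal standalone sketch of the new behaviour (just the naming logic from the hunk above, not the CLI itself):

    def full_package_name(meta: dict) -> str:
        # Mirror of the updated logic: only prepend the language code
        # when the name does not already start with "<lang>_".
        model_name = meta["name"]
        if not model_name.startswith(meta["lang"] + "_"):
            model_name = f"{meta['lang']}_{model_name}"
        return model_name + "-" + meta["version"]

    # Both meta styles now resolve to the same versioned package name.
    assert full_package_name({"lang": "en", "name": "core_web_sm", "version": "3.1.0"}) == "en_core_web_sm-3.1.0"
    assert full_package_name({"lang": "en", "name": "en_core_web_sm", "version": "3.1.0"}) == "en_core_web_sm-3.1.0"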
@@ -128,9 +130,10 @@ def package(
             )
     Path.mkdir(package_path, parents=True)
     shutil.copytree(str(input_dir), str(package_path / model_name_v))
-    license_path = package_path / model_name_v / "LICENSE"
-    if license_path.exists():
-        shutil.move(str(license_path), str(main_path))
+    for file_name in FILENAMES_DOCS:
+        file_path = package_path / model_name_v / file_name
+        if file_path.exists():
+            shutil.move(str(file_path), str(main_path))
     imports = []
     for code_path in code_paths:
         imports.append(code_path.stem)
@@ -294,7 +297,7 @@ def setup_package():

 if __name__ == '__main__':
     setup_package()
-""".strip()
+""".lstrip()


 TEMPLATE_MANIFEST = """
@@ -314,4 +317,7 @@ __version__ = get_model_meta(Path(__file__).parent)['version']

 def load(**overrides):
     return load_model_from_init_py(__file__, **overrides)
-""".strip()
+""".lstrip()
+
+
+FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"]
@@ -372,7 +372,7 @@ factory = "{{ pipe }}"
 [corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
-max_length = {{ 500 if hardware == "gpu" else 2000 }}
+max_length = 0

 [corpora.dev]
 @readers = "spacy.Corpus.v1"
@@ -80,6 +80,8 @@ eval_frequency = 200
 score_weights = {}
 # Names of pipeline components that shouldn't be updated during training
 frozen_components = []
+# Names of pipeline components that should set annotations during training
+annotating_components = []
 # Location in the config where the dev corpus is defined
 dev_corpus = "corpora.dev"
 # Location in the config where the train corpus is defined
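The new annotating_components setting sits next to frozen_components in the training block of the config. A minimal sketch of reading it with spaCy's config loader (the fragment below is illustrative, not a complete training config):

    from spacy.util import load_config_from_str

    cfg = load_config_from_str(
        "[training]\nfrozen_components = []\nannotating_components = [\"tagger\"]\n"
    )
    assert cfg["training"]["annotating_components"] == ["tagger"]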
@@ -24,6 +24,9 @@ def setup_default_warnings():
     for pipe in ["matcher", "entity_ruler"]:
         filter_warning("once", error_msg=Warnings.W036.format(name=pipe))

+    # warn once about lemmatizer without required POS
+    filter_warning("once", error_msg="[W108]")
+

 def filter_warning(action: str, error_msg: str):
     """Customize how spaCy should handle a certain warning.
@@ -28,7 +28,7 @@ cdef class Candidate:

 cdef class KnowledgeBase:
     cdef Pool mem
-    cpdef readonly Vocab vocab
+    cdef readonly Vocab vocab
     cdef int64_t entity_vector_length

     # This maps 64bit keys (hash of unique entity string)
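Declaring vocab as cdef readonly keeps the attribute readable from Python while avoiding the cpdef attribute declaration. A quick sanity check from Python (assuming spaCy with this change is installed):

    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
    # Still readable from Python, just not assignable.
    assert kb.vocab is nlp.vocab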
@@ -1,15 +1,23 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from .lemmatizer import CatalanLemmatizer


 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


 class Catalan(Language):
@@ -17,4 +25,16 @@ class Catalan(Language):
     Defaults = CatalanDefaults


+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Catalan"]
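With the factory registered on the Catalan subclass, the rule-based lemmatizer can be added like any other pipeline component. A minimal sketch (assumes spacy-lookups-data is installed so the Catalan lemma tables can be loaded; without a tagger or morphologizer the rule mode falls back to lookups and lowercasing):

    import spacy

    nlp = spacy.blank("ca")
    nlp.add_pipe("lemmatizer", config={"mode": "rule"})  # "rule" is the factory default
    nlp.initialize()  # loads lemma_rules / lemma_exc / lemma_index / lemma_lookup
    doc = nlp("les cases")
    print([token.lemma_ for token in doc])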
							
								
								
									
spacy/lang/ca/lemmatizer.py (81 lines, new file)
@@ -0,0 +1,81 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class CatalanLemmatizer(Lemmatizer):
+    """
+    Copied from French Lemmatizer
+    Catalan language lemmatizer applies the default rule based lemmatization
+    procedure with some modifications for better Catalan language support.
+
+    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
+    the rule-based lemmatization. As a last resort, the lemmatizer checks in
+    the lookup table.
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "rule":
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            self.cache[cache_key] = forms
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(self.lookup_lemmatize(token)[0])
+        if not forms:
+            forms.append(string)
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units


 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
							
								
								
									
spacy/lang/ca/syntax_iterators.py (46 lines, new file)
@@ -0,0 +1,46 @@
+from ...symbols import NOUN, PROPN
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    np_label = doc.vocab.strings.add("NP")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            left = word.left_edge.i
+            right = word.right_edge.i + 1
+            # leave prepositions and punctuation out of the left side of the chunk
+            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
+                left = word.left_edge.i + 1
+            prev_end = word.right_edge.i
+            # leave subordinated clauses and appositions out of the chunk
+            a = word.i + 1
+            while a < word.right_edge.i:
+                paraula = doc[a]
+                if paraula.pos_ == "VERB":
+                    right = paraula.left_edge.i
+                    prev_end = paraula.left_edge.i - 1
+                elif paraula.dep_ == "appos":
+                    right = paraula.left_edge.i + 1
+                    prev_end = paraula.left_edge.i - 1
+                a += 1
+            # leave punctuation out of the right side of the chunk
+            if word.right_edge.pos_ == "PUNCT":
+                right = right - 1
+            yield left, right, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
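The new syntax iterator backs doc.noun_chunks for Catalan. A minimal usage sketch (requires a trained Catalan pipeline that sets POS tags and a dependency parse; the model name below is only illustrative):

    import spacy

    nlp = spacy.load("ca_core_news_sm")  # illustrative pipeline name
    doc = nlp("La meva germana viu a Barcelona.")
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)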
@@ -24,6 +24,13 @@ for exc_data in [
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
 subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
 sullo suo suoi

-tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
+tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
 troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto

 uguali ulteriore ultimo un una uno uomo
@@ -1,8 +1,15 @@
+from typing import Any, Dict, Union
+from pathlib import Path
+import re
+import srsly
+import string
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ... import util


 DEFAULT_CONFIG = """
@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):

     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)
+
+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        return self
+
+
 class VietnameseDefaults(Language.Defaults):
     config = load_config_from_str(DEFAULT_CONFIG)
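Tokenization through pyvi_tokenize now preserves the original text and whitespace, so get_words_and_spaces can align the words back onto the input. A minimal sketch (assumes the pyvi package is installed and that the shipped default config keeps use_pyvi enabled; otherwise the tokenizer falls back to whitespace splitting):

    import spacy

    text = "Tôi là sinh viên."
    nlp = spacy.blank("vi")
    doc = nlp(text)
    print([token.text for token in doc])
    assert doc.text == text  # whitespace round-trips via get_words_and_spaces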
@@ -690,7 +690,7 @@ class Language:
         if self.vocab.vectors.shape != source.vocab.vectors.shape or \
                 self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
                 self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
-            util.logger.warning(Warnings.W113.format(name=source_name))
+            warnings.warn(Warnings.W113.format(name=source_name))
         if not source_name in source.component_names:
             raise KeyError(
                 Errors.E944.format(
@@ -1075,6 +1075,7 @@ class Language:
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
     ):
         """Update the models in the pipeline.

@@ -1082,10 +1083,13 @@ class Language:
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
         sgd (Optimizer): An optimizer.
-        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
+            component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
             components, keyed by component name.
         exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
         RETURNS (Dict[str, float]): The updated losses dictionary

         DOCS: https://spacy.io/api/language#update
@@ -1104,15 +1108,16 @@ class Language:
             sgd = self._optimizer
         if component_cfg is None:
             component_cfg = {}
+        pipe_kwargs = {}
         for i, (name, proc) in enumerate(self.pipeline):
             component_cfg.setdefault(name, {})
+            pipe_kwargs[name] = deepcopy(component_cfg[name])
             component_cfg[name].setdefault("drop", drop)
+            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            if name in exclude or not hasattr(proc, "update"):
-                continue
-            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd not in (None, False):
-            for name, proc in self.pipeline:
+            if name not in exclude and hasattr(proc, "update"):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+            if sgd not in (None, False):
                 if (
                     name not in exclude
                     and hasattr(proc, "is_trainable")
@@ -1120,6 +1125,18 @@ class Language:
                     and proc.model not in (True, False, None)
                 ):
                     proc.finish_update(sgd)
+            if name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=proc,
+                        name=name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
         return losses

     def rehearse(
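Components named in annotates now write their predictions back onto eg.predicted inside the update loop, so later components can train on top of them. A minimal sketch of the new parameter (a toy setup; real training normally goes through spacy train with annotating_components in the config):

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("tok2vec")
    nlp.add_pipe("tagger")
    examples = [
        Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRON", "VERB", "NOUN"]})
    ]
    nlp.initialize(lambda: examples)
    # components listed in annotates set their predictions on eg.predicted during the update
    losses = nlp.update(examples, annotates=["tok2vec"], losses={})
    print(losses)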
@@ -4,6 +4,7 @@ from collections import defaultdict
 from itertools import product

 import numpy
+import warnings

 from .matcher cimport Matcher
 from ..vocab cimport Vocab
@@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc

 from ..errors import Errors, Warnings
 from ..tokens import Span
-from ..util import logger


 DELIMITER = "||"
@@ -282,7 +282,7 @@ cdef class DependencyMatcher:
         keys_to_position_maps = defaultdict(lambda: defaultdict(list))
         for match_id, start, end in self._matcher(doc):
             if start + 1 != end:
-                logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
+                warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
             token = doc[start]
             root = ([token] + list(token.ancestors))[-1]
             keys_to_position_maps[root.i][match_id].append(start)
@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
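PhraseMatcher now accepts attr=None and treats it as the default ORTH attribute instead of calling .upper() on None. A minimal sketch:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab, attr=None)  # now equivalent to the ORTH default
    matcher.add("GREETING", [nlp.make_doc("hello world")])
    doc = nlp.make_doc("she said hello world")
    print(matcher(doc))  # one match spanning "hello world"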
@@ -1,14 +1,11 @@
 from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap, PreshMapArray
-from libc.stdint cimport uint64_t
-from murmurhash cimport mrmr
+from preshed.maps cimport PreshMap
 cimport numpy as np
+from libc.stdint cimport uint64_t

-from .structs cimport TokenC, MorphAnalysisC
+from .structs cimport MorphAnalysisC
 from .strings cimport StringStore
-from .typedefs cimport hash_t, attr_t, flags_t
-from .parts_of_speech cimport univ_pos_t
-from . cimport symbols
+from .typedefs cimport attr_t, hash_t


 cdef class Morphology:
						 | 
					@ -16,14 +13,6 @@ cdef class Morphology:
 | 
				
			||||||
    cdef readonly StringStore strings
 | 
					    cdef readonly StringStore strings
 | 
				
			||||||
    cdef PreshMap tags # Keyed by hash, value is pointer to tag
 | 
					    cdef PreshMap tags # Keyed by hash, value is pointer to tag
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef public object lemmatizer
 | 
					 | 
				
			||||||
    cdef readonly object tag_map
 | 
					 | 
				
			||||||
    cdef readonly object tag_names
 | 
					 | 
				
			||||||
    cdef readonly object reverse_index
 | 
					 | 
				
			||||||
    cdef readonly object _exc
 | 
					 | 
				
			||||||
    cdef readonly PreshMapArray _cache
 | 
					 | 
				
			||||||
    cdef readonly int n_tags
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
 | 
					    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
 | 
				
			||||||
    cdef int insert(self, MorphAnalysisC tag) except -1
 | 
					    cdef int insert(self, MorphAnalysisC tag) except -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
@@ -1,20 +1,11 @@
 # cython: infer_types
-from libc.string cimport memset
-
-import srsly
-from collections import Counter
 import numpy
 import warnings

-from .attrs cimport POS, IS_SPACE
-from .parts_of_speech cimport SPACE
-from .lexeme cimport Lexeme
+from .attrs cimport POS

-from .strings import get_string_id
-from .attrs import LEMMA, intify_attrs
 from .parts_of_speech import IDS as POS_IDS
-from .errors import Errors, Warnings
-from .util import ensure_path
+from .errors import Warnings
 from . import symbols


@@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):

         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

@@ -102,17 +102,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -317,20 +312,27 @@ class EntityRuler(Pipe):
                 pattern = entry["pattern"]
                 if isinstance(pattern, Doc):
                     self.phrase_patterns[label].append(pattern)
+                    self.phrase_matcher.add(label, [pattern])
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
+                    self.matcher.add(label, [pattern])
                 else:
                     raise ValueError(Errors.E097.format(pattern=pattern))
-            for label, patterns in self.token_patterns.items():
-                self.matcher.add(label, patterns)
-            for label, patterns in self.phrase_patterns.items():
-                self.phrase_matcher.add(label, patterns)
 
     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )
+
+    def _require_patterns(self) -> None:
+        """Raise a warning if this component has no patterns defined."""
+        if len(self) == 0:
+            warnings.warn(Warnings.W036.format(name=self.name))
 
     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
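A minimal usage sketch of the behavior this hunk introduces (illustrative only, not part of the diff; assumes a blank English pipeline): patterns are now registered with the underlying Matcher/PhraseMatcher as soon as they are added, and clear() rebuilds both matchers, so repeated add_patterns calls no longer re-add earlier patterns.

# Illustrative sketch only; names and texts are examples.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
ruler.add_patterns([{"label": "GPE", "pattern": [{"LOWER": "berlin"}]}])  # added incrementally
print(nlp("spaCy is built in Berlin").ents)   # (spaCy, Berlin)
ruler.clear()                                 # resets patterns and both matchers
print(nlp("spaCy is built in Berlin").ents)   # ()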
@@ -381,10 +383,9 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         else:
             self.add_patterns(cfg)
@@ -435,10 +436,9 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
 
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
             from_disk(path, deserializers_patterns, {})
         return self
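A hedged sketch of what these deserialization hunks preserve (illustrative only; the attribute name is an example): the configured phrase_matcher_attr survives a bytes round-trip and the PhraseMatcher is rebuilt with it unconditionally.

# Sketch only, assuming the v3.1 behavior shown above.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
ruler.add_patterns([{"label": "ORG", "pattern": "openai"}])
data = ruler.to_bytes()

nlp2 = spacy.blank("en")
ruler2 = nlp2.add_pipe("entity_ruler")
ruler2.from_bytes(data)
assert ruler2.phrase_matcher_attr == "LOWER"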
@@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
 from pathlib import Path
 
+import warnings
+
 from .pipe import Pipe
 from ..errors import Errors, Warnings
 from ..language import Language
@@ -182,7 +184,7 @@ class Lemmatizer(Pipe):
         univ_pos = token.pos_.lower()
         if univ_pos in ("", "eol", "space"):
             if univ_pos == "":
-                logger.warning(Warnings.W108.format(text=string))
+                warnings.warn(Warnings.W108.format(text=string))
             return [string.lower()]
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(token):
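A small sketch of what this change means for users (illustrative only; assumes the warning text contains the W108 code): the missing-POS message is now a regular Python warning, so it can be filtered with the standard warnings machinery instead of adjusting the spaCy logger.

# Sketch: silence the W108 lemmatizer warning in an application.
import warnings

warnings.filterwarnings("ignore", message=r".*W108.*")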
@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
 
         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
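The pattern adopted by this hunk and the similar ones above, as a standalone sketch (illustrative only): read the model bytes inside a context manager so the file handle is closed promptly instead of relying on garbage collection.

from pathlib import Path

def read_bytes(path: Path) -> bytes:
    # The handle is closed as soon as the block exits.
    with path.open("rb") as infile:
        return infile.read()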
@@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel):
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
+    annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
     before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
     # fmt: on
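For context, how the new setting appears in a training config (excerpt mirroring the config string used in the new test below; the sentencizer name is just an example):

[training]
frozen_components = []
annotating_components = ["sentencizer"]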
@@ -293,6 +293,12 @@ def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo")().tokenizer
@@ -2,8 +2,6 @@ import weakref
 
 import pytest
 import numpy
-import logging
-import mock
 
 from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span, Token

@@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text):
     def inner_func(d1, d2):
         return "hello!"
 
-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
-        _ = tokens.to_bytes()  # noqa: F841
-        mock_warning.assert_not_called()
+    _ = tokens.to_bytes()  # noqa: F841
+    with pytest.warns(UserWarning):
         tokens.user_hooks["similarity"] = inner_func
         _ = tokens.to_bytes()  # noqa: F841
-        mock_warning.assert_called_once()
 
 
 def test_doc_api_set_ents(en_tokenizer):
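The testing pattern this and the later test hunks switch to, as a minimal sketch (illustrative only): user-facing messages are now regular Python warnings, so tests assert them with pytest.warns instead of mocking the spaCy logger.

import warnings
import pytest

def emit():
    # Stands in for a spaCy call that issues a UserWarning.
    warnings.warn("something to flag", UserWarning)

with pytest.warns(UserWarning):
    emit()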
@@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
     una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
 
     tokens = ca_tokenizer(text)
-    assert len(tokens) == 138
+    assert len(tokens) == 140
 
 
 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 6),
+        ("Perquè va anar-hi?", 4),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: es, pl
-LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
+LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on


spacy/tests/lang/vi/__init__.py (new file, 0 lines)

spacy/tests/lang/vi/test_serialize.py (new file, 33 lines)
@@ -0,0 +1,33 @@
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir


def test_vi_tokenizer_serialize(vi_tokenizer):
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.use_pyvi == False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi == False


spacy/tests/lang/vi/test_tokenizer.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import pytest

from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
from spacy.lang.vi import Vietnamese


# fmt: off
TOKENIZER_TESTS = [
    ("Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn  bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
]
# fmt: on


@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
    tokens = [token.text for token in vi_tokenizer(text)]
    assert tokens == expected_tokens


def test_vi_tokenizer_extra_spaces(vi_tokenizer):
    # note: three spaces after "I"
    tokens = vi_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == "  "


@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
    tokens = vi_tokenizer(text)
    assert tokens.text_with_ws == text


def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
    doc = vi_tokenizer("")
    assert len(doc) == 0
    doc = vi_tokenizer(" ")
    assert len(doc) == 1
    doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
    assert len(doc) == 1


def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    assert [t.text for t in doc if not t.is_space] == text.split()
    assert doc[4].text == " "
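A short usage sketch based on the tests above (illustrative only): pyvi-backed word segmentation is the default for the new Vietnamese tokenizer and can be disabled through the tokenizer config; the pyvi package is only required when it is enabled.

from spacy.lang.vi import Vietnamese

nlp = Vietnamese()  # uses pyvi word segmentation (requires the optional pyvi package)
nlp_ws = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})  # whitespace only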
@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
 
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
+    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]


spacy/tests/pipeline/test_annotates_on_update.py (new file, 113 lines)
@@ -0,0 +1,113 @@
from typing import Callable, Iterable, Iterator
import pytest
import io

from thinc.api import Config
from spacy.language import Language
from spacy.training import Example
from spacy.training.loop import train
from spacy.lang.en import English
from spacy.util import registry, load_model_from_config


@pytest.fixture
def config_str():
    return """
    [nlp]
    lang = "en"
    pipeline = ["sentencizer","assert_sents"]
    disabled = []
    before_creation = null
    after_creation = null
    after_pipeline_creation = null
    batch_size = 1000
    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

    [components]

    [components.assert_sents]
    factory = "assert_sents"

    [components.sentencizer]
    factory = "sentencizer"
    punct_chars = null

    [training]
    dev_corpus = "corpora.dev"
    train_corpus = "corpora.train"
    annotating_components = ["sentencizer"]
    max_steps = 2

    [corpora]

    [corpora.dev]
    @readers = "unannotated_corpus"

    [corpora.train]
    @readers = "unannotated_corpus"
    """


def test_annotates_on_update():
    # The custom component checks for sentence annotation
    @Language.factory("assert_sents", default_config={})
    def assert_sents(nlp, name):
        return AssertSents(name)

    class AssertSents:
        def __init__(self, name, **cfg):
            self.name = name
            pass

        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
                raise ValueError("No sents")
            return doc

        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            for example in examples:
                if not example.predicted.has_annotation("SENT_START"):
                    raise ValueError("No sents")
            return {}

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("assert_sents")

    # When the pipeline runs, annotations are set
    doc = nlp("This is a sentence.")

    examples = []
    for text in ["a a", "b b", "c c"]:
        examples.append(Example(nlp.make_doc(text), nlp(text)))

    for example in examples:
        assert not example.predicted.has_annotation("SENT_START")

    # If updating without setting annotations, assert_sents will raise an error
    with pytest.raises(ValueError):
        nlp.update(examples)

    # Updating while setting annotations for the sentencizer succeeds
    nlp.update(examples, annotates=["sentencizer"])


def test_annotating_components_from_config(config_str):
    @registry.readers("unannotated_corpus")
    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
        return UnannotatedCorpus()

    class UnannotatedCorpus:
        def __call__(self, nlp: Language) -> Iterator[Example]:
            for text in ["a a", "b b", "c c"]:
                doc = nlp.make_doc(text)
                yield Example(doc, doc)

    orig_config = Config().from_str(config_str)
    nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
    train(nlp)

    nlp.config["training"]["annotating_components"] = []
    with pytest.raises(ValueError):
        train(nlp)
@@ -89,6 +89,19 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0
 
 
+def test_entity_ruler_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world")
+    assert len(doc.ents) == 1
+    ruler.clear()
+    assert len(ruler.labels) == 0
+    doc = nlp("hello world")
+    assert len(doc.ents) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
@@ -1,6 +1,4 @@
 import pytest
-import logging
-import mock
 import pickle
 from spacy import util, registry
 from spacy.lang.en import English

@@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp):
 
     # warning if no POS assigned
     doc = nlp.make_doc("coping")
-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
+    with pytest.warns(UserWarning):
         doc = lemmatizer(doc)
-        mock_warning.assert_called_once()
+    # warns once by default
+    doc = lemmatizer(doc)
 
     # works with POS
     doc = nlp.make_doc("coping")
@@ -1,6 +1,4 @@
 import pytest
-import mock
-import logging
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
@@ -334,24 +332,31 @@ def test_language_factories_invalid():
 
 
 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
         (
             [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
             {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
         ),
         (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
@@ -377,17 +382,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
     assert score_weights == expected
@@ -430,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass():
     nlp = English()
     nlp.vocab.vectors.resize((1, 4))
     nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
-    logger = logging.getLogger("spacy")
-    with mock.patch.object(logger, "warning") as mock_warning:
+    with pytest.warns(UserWarning):
         nlp.add_pipe("tagger", source=source_nlp)
-        mock_warning.assert_called()
 
 
 def test_pipe_factories_from_source_custom():
@@ -1,7 +1,9 @@
 import pytest
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names
+from spacy.lang.en import English
 
 
 @pytest.fixture
@@ -417,3 +419,41 @@ def test_pipe_methods_initialize():
     assert "test" in nlp.config["initialize"]["components"]
     nlp.remove_pipe("test")
     assert "test" not in nlp.config["initialize"]["components"]
+
+
+def test_update_with_annotates():
+    name = "test_with_annotates"
+    results = {}
+
+    def make_component(name):
+        results[name] = ""
+
+        def component(doc):
+            nonlocal results
+            results[name] += doc.text
+            return doc
+
+        return component
+
+    c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
+    c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
+
+    components = set([f"{name}1", f"{name}2"])
+
+    nlp = English()
+    texts = ["a", "bb", "ccc"]
+    examples = []
+    for text in texts:
+        examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))
+
+    for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]:
+        for key in results:
+            results[key] = ""
+        nlp = English(vocab=nlp.vocab)
+        nlp.add_pipe(f"{name}1")
+        nlp.add_pipe(f"{name}2")
+        nlp.update(examples, annotates=components_to_annotate)
+        for component in components_to_annotate:
+            assert results[component] == "".join(eg.predicted.text for eg in examples)
+        for component in components - set(components_to_annotate):
+            assert results[component] == ""


spacy/tests/regression/test_issue8216.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import pytest

from spacy import registry
from spacy.language import Language
from spacy.pipeline import EntityRuler


@pytest.fixture
def nlp():
    return Language()


@pytest.fixture
@registry.misc("entity_ruler_patterns")
def patterns():
    return [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
    ]


def test_entity_ruler_fix8216(nlp, patterns):
    """Test that patterns don't get added excessively."""
    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
    ruler.add_patterns(patterns)
    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
    assert pattern_count > 0
    ruler.add_patterns([])
    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
    assert after_count == pattern_count
@@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    text = loc.open("r", encoding="utf8").read()
+    with loc.open("r", encoding="utf8") as infile:
+        text = infile.read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -14,7 +14,7 @@ cdef class Tokenizer:
     cdef Pool mem
     cdef PreshMap _cache
     cdef PreshMap _specials
-    cpdef readonly Vocab vocab
+    cdef readonly Vocab vocab
 
     cdef object _token_match
     cdef object _url_match
@@ -1321,7 +1321,7 @@ cdef class Doc:
             if "user_data_values" not in exclude:
                 serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
         if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)):
-            util.logger.warning(Warnings.W109)
+            warnings.warn(Warnings.W109)
         return util.to_dict(serializers, exclude)
 
     def from_dict(self, msg, *, exclude=tuple()):
@@ -74,6 +74,8 @@ def train(
 
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
     # Create iterator, which yields out info after each optimization step.
     training_step_iterator = train_while_improving(
         nlp,

@@ -86,11 +88,17 @@ def train(
         max_steps=T["max_steps"],
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
+        annotating_components=annotating_components,
     )
     clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
     if frozen_components:
         stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}")
+            + "\n"
+        )
     stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)

@@ -142,6 +150,7 @@ def train_while_improving(
     patience: int,
     max_steps: int,
     exclude: List[str],
+    annotating_components: List[str],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,

@@ -193,7 +202,12 @@ def train_while_improving(
         dropout = next(dropouts)
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
             )
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
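What this plumbing enables, as a minimal sketch mirroring the tests above (illustrative only; the sentencizer and example texts are placeholders): components listed in annotating_components set their annotations on the predicted docs during each update, so downstream components can rely on them while training.

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
examples = [Example(nlp.make_doc("One. Two."), nlp.make_doc("One. Two."))]
# The sentencizer annotates the predicted docs as part of the update step.
nlp.update(examples, annotates=["sentencizer"])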
@@ -1370,32 +1370,14 @@ def combine_score_weights(
         should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
     return result
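The simplified normalization, worked through for one of the parametrized cases in the tests above (a standalone sketch, not part of the diff): every weight is divided by the sum of all weights, with overrides applied first.

weights = [{"a": 1.0}, {"b": 50}, {"c": 100}]
merged = {k: v for d in weights for k, v in d.items()}
total = sum(merged.values())  # 151.0
print({k: round(v / total, 2) for k, v in merged.items()})  # {'a': 0.01, 'b': 0.33, 'c': 0.66}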
@@ -25,12 +25,12 @@ cdef struct _Cached:
 
 cdef class Vocab:
     cdef Pool mem
-    cpdef readonly StringStore strings
-    cpdef public Morphology morphology
-    cpdef public object vectors
-    cpdef public object _lookups
-    cpdef public object writing_system
-    cpdef public object get_noun_chunks
+    cdef readonly StringStore strings
+    cdef public Morphology morphology
+    cdef public object vectors
+    cdef public object _lookups
+    cdef public object writing_system
+    cdef public object get_noun_chunks
     cdef readonly int length
     cdef public object data_dir
     cdef public object lex_attr_getters
@@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.

This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).

| Name | Description |
| ---- | ----------- |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ |
| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
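For orientation (this excerpt is editorial and not part of the commit), a `[training]` block that spells out the defaults documented in the table above would look roughly like the following sketch; every value shown is the stated default and can be overridden or interpolated from other config sections:

```ini
### config.cfg (excerpt, illustrative defaults only)
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
```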
### pretraining {#config-pretraining tag="section,optional"}
@@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients.
> losses = trf.update(examples, sgd=optimizer)
> ```

| Name | Description |
| ---- | ----------- |
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

## Transformer.create_optimizer {#create_optimizer tag="method"}
@@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the
depending on the sentence lengths. However, it does provide the transformer with
more meaningful windows to attend over.

+To set sentence boundaries with the `sentencizer` during training, add a
+`sentencizer` to the beginning of the pipeline and include it in
+[`[training.annotating_components]`](/usage/training#annotating-components) to
+have it set the sentence boundaries before the `transformer` component runs.
+
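As an editorial illustration (not part of this diff), the setup suggested above could look like the excerpt below; the trailing `ner` component is a hypothetical placeholder for whatever follows the transformer in a real pipeline:

```ini
### config.cfg (excerpt, illustrative)
[nlp]
pipeline = ["sentencizer", "transformer", "ner"]

[training]
annotating_components = ["sentencizer"]
```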
### strided_spans.v1 {#strided_spans tag="registered function"}

> #### Example config
@@ -422,11 +422,11 @@ as-is. They are also excluded when calling
> #### Note on frozen components
>
> Even though frozen components are not **updated** during training, they will
-> still **run** during training and evaluation. This is very important, because
-> they may still impact your model's performance – for instance, a sentence
-> boundary detector can impact what the parser or entity recognizer considers a
-> valid parse. So the evaluation results should always reflect what your
-> pipeline will produce at runtime.
+> still **run** during evaluation. This is very important, because they may
+> still impact your model's performance – for instance, a sentence boundary
+> detector can impact what the parser or entity recognizer considers a valid
+> parse. So the evaluation results should always reflect what your pipeline will
+> produce at runtime.

```ini
[nlp]
@@ -463,6 +463,64 @@ replace_listeners = ["model.tok2vec"]

</Infobox>

+### Using predictions from preceding components {#annotating-components new="3.1"}
+
+By default, components are updated in isolation during training, which means
+that they don't see the predictions of any earlier components in the pipeline. A
+component receives [`Example.predicted`](/api/example) as input and compares its
+predictions to [`Example.reference`](/api/example) without saving its
+annotations in the `predicted` doc.
+
+Instead, if certain components should **set their annotations** during training,
+use the setting `annotating_components` in the `[training]` block to specify a
+list of components. For example, the feature `DEP` from the parser could be used
+as a tagger feature by including `DEP` in the tok2vec `attrs` and including
+`parser` in `annotating_components`:
+
+```ini
+### config.cfg (excerpt) {highlight="7,12"}
+[nlp]
+pipeline = ["parser", "tagger"]
+
+[components.tagger.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tagger.model.tok2vec.encode.width}
+attrs = ["NORM","DEP"]
+rows = [5000,2500]
+include_static_vectors = false
+
+[training]
+annotating_components = ["parser"]
+```
+
+Any component in the pipeline can be included as an annotating component,
+including frozen components. Frozen components can set annotations during
+training just as they would set annotations during evaluation or when the final
+pipeline is run. The config excerpt below shows how a frozen `ner` component and
+a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the
+entity linker during training:
+
+```ini
+### config.cfg (excerpt)
+[nlp]
+pipeline = ["sentencizer", "ner", "entity_linker"]
+
+[components.ner]
+source = "en_core_web_sm"
+
+[training]
+frozen_components = ["ner"]
+annotating_components = ["sentencizer", "ner"]
+```
+
+<Infobox variant="warning" title="Training speed with annotating components" id="annotating-components-speed">
+
+Be aware that non-frozen annotating components with statistical models will
+**run twice** on each batch, once to update the model and once to apply the
+now-updated model to the predicted docs.
+
+</Infobox>
+
### Using registered functions {#config-functions}

The training configuration defined in the config file doesn't have to only
@@ -25,7 +25,13 @@
            "code": "ca",
            "name": "Catalan",
            "example": "Això és una frase.",
-            "has_examples": true
+            "has_examples": true,
+            "models": [
+                "ca_core_news_sm",
+                "ca_core_news_md",
+                "ca_core_news_lg",
+                "ca_core_news_trf"
+            ]
        },
        {
            "code": "cs",
@@ -40,7 +46,8 @@
            "models": [
                "da_core_news_sm",
                "da_core_news_md",
-                "da_core_news_lg"
+                "da_core_news_lg",
+                "da_core_news_trf"
            ]
        },
        {