diff --git a/.github/contributors/tupui.md b/.github/contributors/tupui.md
new file mode 100644
index 000000000..5f53a72f8
--- /dev/null
+++ b/.github/contributors/tupui.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Pamphile Roy         |
+| Company name (if applicable)   | N/A                  |
+| Title or role (if applicable)  | N/A                  |
+| Date                           | January 29th, 2021   |
+| GitHub username                | tupui                |
+| Website (optional)             | N/A                  |
diff --git a/requirements.txt b/requirements.txt
index 3cb30597a..c01399907 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,7 @@ importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
 cython>=0.25
-pytest>=4.6.5
+pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index a7ebbfd77..75f68373d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -105,8 +105,6 @@ def download_model(
     filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
     download_url = about.__download_url__ + "/" + filename
-    pip_args = ["--no-cache-dir"]
-    if user_pip_args:
-        pip_args.extend(user_pip_args)
+    pip_args = user_pip_args if user_pip_args is not None else []
     cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     run_command(cmd)
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 64d597890..16404a6a0 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -105,12 +105,15 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:


 def get_markdown(
-    data: Dict[str, Any], title: Optional[str] = None, exclude: Optional[List[str]] = None
+    data: Dict[str, Any],
+    title: Optional[str] = None,
+    exclude: Optional[List[str]] = None,
 ) -> str:
     """Get data in GitHub-flavoured Markdown format for issues etc.

-    data (dict or list of tuples): Label/value pairs.
-    title (str / None): Title, will be rendered as headline 2.
+    data (Dict[str, Any]): Label/value pairs.
+    title (str): Optional title, will be rendered as headline 2.
+    exclude (List[str]): Names of keys to exclude.
     RETURNS (str): The Markdown string.
""" md = MarkdownRenderer() diff --git a/spacy/errors.py b/spacy/errors.py index 6874f9a0c..f16f4fd9d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -78,7 +78,7 @@ class Warnings: W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' "attribute or operator.") - # TODO: fix numbering after merging develop into master + # New warnings added in v3.x W086 = ("Component '{listener}' will be (re)trained, but it needs the component " "'{name}' which is frozen. You can either freeze both, or neither " "of the two. If you're sourcing the component from " @@ -483,8 +483,9 @@ class Errors: E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") E200 = ("Can't yet set {attr} from Span. Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") + E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") - # TODO: fix numbering after merging develop into master + # New errors added in v3.x E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not " "found in config for component '{name}'.") E887 = ("Can't replace {name} -> {tok2vec} listeners: the paths to replace " diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index a0230b850..4b329b6f7 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -29,7 +29,9 @@ class Spanish(Language): default_config={"model": None, "mode": "rule", "overwrite": False}, default_score_weights={"lemma_acc": 1.0}, ) -def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool): +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py index cd51c2714..8ec727ac1 100644 --- a/spacy/lang/ky/tokenizer_exceptions.py +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -47,6 +47,7 @@ for exc_data in [ # "etc." 
     {ORTH: "көч.", NORM: "көчөсү"},
     {ORTH: "м-н", NORM: "менен"},
     {ORTH: "б-ча", NORM: "боюнча"},
-]: _exc[exc_data[ORTH]] = [exc_data]
+]:
+    _exc[exc_data[ORTH]] = [exc_data]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py
index 506aa8f32..22aee0941 100644
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@@ -19,7 +19,7 @@ _infixes = (
 )


-_suffixes = ["\."] + list(TOKENIZER_SUFFIXES)
+_suffixes = [r"\."] + list(TOKENIZER_SUFFIXES)


 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/mk/lemmatizer.py b/spacy/lang/mk/lemmatizer.py
index ce3e73b7a..a792095e7 100644
--- a/spacy/lang/mk/lemmatizer.py
+++ b/spacy/lang/mk/lemmatizer.py
@@ -9,7 +9,6 @@ class MacedonianLemmatizer(Lemmatizer):
     def rule_lemmatize(self, token: Token) -> List[str]:
         string = token.text
         univ_pos = token.pos_.lower()
-        morphology = token.morph.to_dict()

         if univ_pos in ("", "eol", "space"):
             return [string.lower()]
diff --git a/spacy/language.py b/spacy/language.py
index eca311e8f..aab7c9b7a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1686,7 +1686,10 @@ class Language:
         return nlp

     def replace_listeners(
-        self, tok2vec_name: str, pipe_name: str, listeners: Iterable[str],
+        self,
+        tok2vec_name: str,
+        pipe_name: str,
+        listeners: Iterable[str],
     ) -> None:
         """Find listener layers (connecting to a token-to-vector embedding
         component) of a given pipeline component model and replace
diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py
index 3d7d81c70..5e2278b5f 100644
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@@ -135,7 +135,6 @@ class AttributeRuler(Pipe):
             ) from None
             set_token_attrs(span[index], attrs)

-
     def load_from_tag_map(
         self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]
     ) -> None:
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 09debc993..5f4420233 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -217,7 +217,6 @@ class EntityLinker(TrainablePipe):
             return losses
         validate_examples(examples, "EntityLinker.update")
         sentence_docs = []
-        docs = [eg.predicted for eg in examples]
         for eg in examples:
             sentences = [s for s in eg.reference.sents]
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index d955e970d..b233d1623 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,10 +1,9 @@
-import srsly
-from thinc.api import Config
 from typing import Dict, Any
+import srsly
+
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
-from ..util import filter_spans
 from .. import util
@@ -64,7 +63,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     merger = Matcher(doc.vocab)
     merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]])
     matches = merger(doc)
-    spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
+    spans = util.filter_spans([doc[start : end + 1] for _, start, end in matches])
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
@@ -77,15 +76,9 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     retokenizes=True,
 )
 def make_token_splitter(
-    nlp: Language,
-    name: str,
-    *,
-    min_length=0,
-    split_length=0,
+    nlp: Language, name: str, *, min_length=0, split_length=0,
 ):
-    return TokenSplitter(
-        min_length=min_length, split_length=split_length
-    )
+    return TokenSplitter(min_length=min_length, split_length=split_length)


 class TokenSplitter:
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index d70c5867c..07d5b4af1 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,5 +1,4 @@
-from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
-from typing import Tuple
+from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple
 from thinc.api import Model
 from pathlib import Path
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index ea95ca772..3ccdcc228 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -641,7 +641,8 @@ def test_doc_noun_chunks_not_implemented():
     nlp = MultiLanguage()
     doc = nlp(text)
     with pytest.raises(NotImplementedError):
-        chunks = list(doc.noun_chunks)
+        _ = list(doc.noun_chunks)  # noqa: F841
+

 def test_span_groups(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 4c7f0c86b..078cc81b1 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -195,6 +195,12 @@ def test_spans_by_character(doc):
     assert span1.end_char == span2.end_char
     assert span2.label_ == "GPE"

+    # unsupported alignment mode
+    with pytest.raises(ValueError):
+        span2 = doc.char_span(
+            span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
+        )
+

 def test_span_to_array(doc):
     span = doc[1:-2]
diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py
index 99dab2b16..91a048764 100644
--- a/spacy/tests/lang/ky/test_tokenizer.py
+++ b/spacy/tests/lang/ky/test_tokenizer.py
@@ -18,9 +18,7 @@ PUNC_INSIDE_WORDS_TESTS = [
     ('То"кой', 'То " кой'.split()),
 ]

-MIXED_ORDINAL_NUMS_TESTS = [
-    ("Эртең 22-январь...", "Эртең 22 - январь ...".split())
-]
+MIXED_ORDINAL_NUMS_TESTS = [("Эртең 22-январь...", "Эртең 22 - январь ...".split())]

 ABBREV_TESTS = [
     ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()),
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index d7492aa67..a563ddaa2 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -2,7 +2,6 @@ import pytest
 import pickle
 import re
 import copy
-import logging
 from mock import Mock
 from spacy.matcher import DependencyMatcher
 from spacy.tokens import Doc
@@ -343,6 +342,5 @@ def test_dependency_matcher_long_matches(en_vocab, doc):
     ]

     matcher = DependencyMatcher(en_vocab)
-    logger = logging.getLogger("spacy")
     with pytest.raises(ValueError):
         matcher.add("pattern", [pattern])
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index e95bd5eba..c7a3fef7d 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -322,4 +322,4 @@ def test_phrase_matcher_deprecated(en_vocab):

 @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"])
 def test_phrase_matcher_sent_start(en_vocab, attr):
-    matcher = PhraseMatcher(en_vocab, attr=attr)
+    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index c6ac42dd2..ac5428de6 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -6,7 +6,6 @@ from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
 from spacy.training import Example
-from spacy.training.initialize import init_nlp
 from spacy import util
 from spacy.lang.en import English
 from thinc.api import Config
diff --git a/spacy/tests/regression/test_issue6258.py b/spacy/tests/regression/test_issue6001-6500.py
similarity index 55%
rename from spacy/tests/regression/test_issue6258.py
rename to spacy/tests/regression/test_issue6001-6500.py
index 9ce9026c0..470b2f388 100644
--- a/spacy/tests/regression/test_issue6258.py
+++ b/spacy/tests/regression/test_issue6001-6500.py
@@ -1,6 +1,21 @@
-import pytest
+from spacy.util import filter_spans
 from pydantic import ValidationError
 from spacy.schemas import TokenPattern, TokenPatternSchema
+import pytest
+
+
+def test_issue6207(en_tokenizer):
+    doc = en_tokenizer("zero one two three four five six")
+
+    # Make spans
+    s1 = doc[:4]
+    s2 = doc[3:6]  # overlaps with s1
+    s3 = doc[5:7]  # overlaps with s2, not s1
+
+    result = filter_spans((s1, s2, s3))
+    assert s1 in result
+    assert s2 not in result
+    assert s3 in result


 def test_issue6258():
diff --git a/spacy/tests/regression/test_issue6207.py b/spacy/tests/regression/test_issue6207.py
deleted file mode 100644
index 9d8b047bf..000000000
--- a/spacy/tests/regression/test_issue6207.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from spacy.util import filter_spans
-
-
-def test_issue6207(en_tokenizer):
-    doc = en_tokenizer("zero one two three four five six")
-
-    # Make spans
-    s1 = doc[:4]
-    s2 = doc[3:6]  # overlaps with s1
-    s3 = doc[5:7]  # overlaps with s2, not s1
-
-    result = filter_spans((s1, s2, s3))
-    assert s1 in result
-    assert s2 not in result
-    assert s3 in result
diff --git a/spacy/tests/regression/test_issue6815.py b/spacy/tests/regression/test_issue6815.py
index 7c888a314..7d523e00b 100644
--- a/spacy/tests/regression/test_issue6815.py
+++ b/spacy/tests/regression/test_issue6815.py
@@ -1,12 +1,11 @@
 import pytest
 from spacy.lang.en import English
-import numpy as np

+import numpy as np

 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,label", [
-        ('Welcome to Mumbai, my friend',11,17,'GPE')
-    ]
+    "sentence, start_idx,end_idx,label",
+    [("Welcome to Mumbai, my friend", 11, 17, "GPE")],
 )
 def test_char_span_label(sentence, start_idx, end_idx, label):
     nlp = English()
@@ -14,10 +13,9 @@ def test_char_span_label(sentence, start_idx, end_idx, label):
     span = doc[:].char_span(start_idx, end_idx, label=label)
     assert span.label_ == label

+
 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,kb_id", [
-        ('Welcome to Mumbai, my friend',11,17,5)
-    ]
+    "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
 )
 def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
     nlp = English()
@@ -25,13 +23,13 @@ def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id):
     span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
     assert span.kb_id == kb_id

+
 @pytest.mark.parametrize(
-    "sentence, start_idx,end_idx,vector", [
-        ('Welcome to Mumbai, my friend',11,17,np.array([0.1,0.2,0.3]))
-    ]
+    "sentence, start_idx,end_idx,vector",
+    [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))],
 )
 def test_char_span_vector(sentence, start_idx, end_idx, vector):
     nlp = English()
     doc = nlp(sentence)
     span = doc[:].char_span(start_idx, end_idx, vector=vector)
-    assert (span.vector == vector).all()
\ No newline at end of file
+    assert (span.vector == vector).all()
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index f1b4eba6e..bec85a1a2 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -186,6 +186,7 @@ def test_language_pipe_error_handler():

 def test_language_pipe_error_handler_custom(en_vocab):
     """Test the error handling of a custom component that has no pipe method"""
+
     @Language.component("my_evil_component")
     def evil_component(doc):
         if "2" in doc.text:
@@ -194,6 +195,7 @@ def test_language_pipe_error_handler_custom(en_vocab):

     def warn_error(proc_name, proc, docs, e):
         from spacy.util import logger
+
         logger.warning(f"Trouble with component {proc_name}.")

     nlp = English()
@@ -217,6 +219,7 @@ def test_language_pipe_error_handler_custom(en_vocab):

 def test_language_pipe_error_handler_pipe(en_vocab):
     """Test the error handling of a component's pipe method"""
+
     @Language.component("my_sentences")
     def perhaps_set_sentences(doc):
         if not doc.text.startswith("4"):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 489de2201..a40931720 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -525,8 +525,9 @@ cdef class Doc:
             label = self.vocab.strings.add(label)
         if not isinstance(kb_id, int):
            kb_id = self.vocab.strings.add(kb_id)
-        if alignment_mode not in ("strict", "contract", "expand"):
-            alignment_mode = "strict"
+        alignment_modes = ("strict", "contract", "expand")
+        if alignment_mode not in alignment_modes:
+            raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
         cdef int start = token_by_char(self.c, self.length, start_idx)
         if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
             return None
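Reviewer note: the `spacy/tokens/doc.pyx` hunk above is the behavioural core of this patch. An unrecognised `alignment_mode` passed to `Doc.char_span` now raises `ValueError` (the new E202) instead of being silently coerced to "strict". A minimal sketch of how the change behaves, assuming a blank English pipeline; the text and character offsets are arbitrary examples borrowed from the tests above:

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("Welcome to Mumbai, my friend")

    # Supported modes keep working exactly as before
    span = doc.char_span(11, 17, label="GPE", alignment_mode="strict")
    assert span.text == "Mumbai"

    # With this patch, an unsupported mode raises E202 instead of
    # silently falling back to "strict"
    try:
        doc.char_span(11, 17, label="GPE", alignment_mode="unk")
    except ValueError as err:
        print(err)  # Unsupported alignment mode 'unk'. Supported modes: strict, contract, expand.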