From f00254ae276eca963991efb8a45748b2948b1c77 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 20 Jun 2022 08:48:40 +0100 Subject: [PATCH 01/25] add counts to verbose list of NER labels (#10957) --- spacy/cli/debug_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 8a6dde955..bd05471b1 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -361,7 +361,7 @@ def debug_data( if label != "-" ] labels_with_counts = _format_labels(labels_with_counts, counts=True) - msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose) + msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) missing_labels = model_labels - labels if missing_labels: msg.warn( From cdad815c6854a5349abbde469f2478585b118e6a Mon Sep 17 00:00:00 2001 From: Lucaterre Date: Mon, 20 Jun 2022 14:28:49 +0200 Subject: [PATCH 02/25] updated spacy universe for spacyfishing --- .github/contributors/Lucaterre.md | 106 ++++++++++++++++++++++++++++++ website/meta/universe.json | 29 ++++++++ 2 files changed, 135 insertions(+) create mode 100644 .github/contributors/Lucaterre.md diff --git a/.github/contributors/Lucaterre.md b/.github/contributors/Lucaterre.md new file mode 100644 index 000000000..5da763b22 --- /dev/null +++ b/.github/contributors/Lucaterre.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- |---------------| +| Name | Lucas Terriel | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2022-06-20 | +| GitHub username | Lucaterre | +| Website (optional) | | \ No newline at end of file diff --git a/website/meta/universe.json b/website/meta/universe.json index 9b644adf4..ce2c63739 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,34 @@ { "resources": [ + { + "id": "spacyfishing", + "title": "spaCy fishing", + "slogan": "Named entity disambiguation and linking on Wikidata in spaCy with Entity-Fishing.", + "description": "A spaCy wrapper of Entity-Fishing for named entity disambiguation and linking against a Wikidata knowledge base.", + "github": "Lucaterre/spacyfishing", + "pip": "spacyfishing", + "code_example": [ + "import spacy", + "text = 'Victor Hugo and Honoré de Balzac are French writers who lived in Paris.'", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('spacyfishing')", + "doc = nlp(text)", + "for span in doc.ents:", + " print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))", + "# ('Victor Hugo', 'PERSON', 'Q535', 'https://www.wikidata.org/wiki/Q535', 0.972)", + "# ('Honoré de Balzac', 'PERSON', 'Q9711', 'https://www.wikidata.org/wiki/Q9711', 0.9724)", + "# ('French', 'NORP', 'Q121842', 'https://www.wikidata.org/wiki/Q121842', 0.3739)", + "# ('Paris', 'GPE', 'Q90', 'https://www.wikidata.org/wiki/Q90', 0.5652)", + "## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids" + ], + "category": ["models", "pipeline"], + "tags": ["NER", "NEL"], + "author": "Lucas Terriel", + "author_links": { + "twitter": "TerreLuca", + "github": "Lucaterre" + } + }, { "id": "aim-spacy", "title": "Aim-spaCy", From 2820d7dd8daa66e12bb7c07b1dcfb31423741a72 Mon Sep 17 00:00:00 2001 From: Lucaterre Date: Mon, 20 Jun 2022 15:26:23 +0200 Subject: [PATCH 03/25] correct typo in universe.json for 'code_example' key : pipe name 'entityfishing' --- website/meta/universe.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index ce2c63739..4a3ec6225 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -11,7 +11,7 @@ "import spacy", "text = 'Victor Hugo and Honoré de Balzac are French writers who lived in Paris.'", "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe('spacyfishing')", + "nlp.add_pipe('entityfishing')", "doc = nlp(text)", "for span in doc.ents:", " print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))", From a08ca064e53810cf1c7c0aa1ee7030654d11b5aa Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Tue, 21 Jun 2022 01:03:41 -0500 Subject: [PATCH 04/25] Update linguistic-features.md (#10993) Change link for downloading fasttext word vectors --- website/docs/usage/linguistic-features.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index c547ec0bc..9dae6f2ee 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1899,7 +1899,7 @@ access to some nice Latin vectors. 
You can then pass the directory path to > ``` ```cli -$ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz +$ wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz $ python -m spacy init vectors en cc.la.300.vec.gz /tmp/la_vectors_wiki_lg ``` From 0271306f1603a3f70870c1786e8783fe39e22bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 21 Jun 2022 08:26:59 +0200 Subject: [PATCH 05/25] Use thinc-apple-ops>=0.1.0.dev0 with `apple` extras (#10904) * Use thinc-apple-ops>=0.1.0.dev0 with `apple` extras Also test with thinc-apple-ops that is at least 0.1.0.dev0. * Check thinc-apple-ops on macOS with Python 3.10 Co-authored-by: Adriane Boyd * Use `pip install --pre` for installing thinc-apple-ops in CI Co-authored-by: Adriane Boyd --- .github/azure-steps.yml | 4 ++-- setup.cfg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 80c88b0b8..d7233328a 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -111,7 +111,7 @@ steps: condition: eq(variables['python_version'], '3.8') - script: | - ${{ parameters.prefix }} python -m pip install thinc-apple-ops + ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops ${{ parameters.prefix }} python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) diff --git a/setup.cfg b/setup.cfg index 110a2e4ee..d317847ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,7 +104,7 @@ cuda114 = cuda115 = cupy-cuda115>=5.0.0b4,<11.0.0 apple = - thinc-apple-ops>=0.0.4,<1.0.0 + thinc-apple-ops>=0.1.0.dev0,<1.0.0 # Language tokenizers with external dependencies ja = sudachipy>=0.5.2,!=0.6.1 From 0fa004c4cd718319d750abad896447c114f39106 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 21 Jun 2022 21:00:07 +0100 Subject: [PATCH 06/25] the 'new' indicator wants a 'number' (#10997) --- website/docs/api/spanruler.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/spanruler.md b/website/docs/api/spanruler.md index a1c222714..b573f7c58 100644 --- a/website/docs/api/spanruler.md +++ b/website/docs/api/spanruler.md @@ -2,7 +2,7 @@ title: SpanRuler tag: class source: spacy/pipeline/span_ruler.py -new: 3.3.1 +new: 3.3 teaser: 'Pipeline component for rule-based span and named entity recognition' api_string_name: span_ruler api_trainable: false From bed23ff291f3e97f5ba6ee42f1a80db7c713b691 Mon Sep 17 00:00:00 2001 From: jademlc <68696651+jademlc@users.noreply.github.com> Date: Wed, 22 Jun 2022 20:45:26 +0200 Subject: [PATCH 07/25] Update serialization methods code block (#11004) * Update serialization methods code block * Update website/docs/usage/saving-loading.md Co-authored-by: Adriane Boyd --- website/docs/usage/saving-loading.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index af140e7a7..0fd713a49 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -203,11 +203,14 @@ the data to and from a JSON file. 
```python ### {highlight="16-23,25-30"} +import json +from spacy import Language from spacy.util import ensure_path @Language.factory("my_component") class CustomComponent: - def __init__(self): + def __init__(self, nlp: Language, name: str = "my_component"): + self.name = name self.data = [] def __call__(self, doc): @@ -231,7 +234,7 @@ class CustomComponent: # This will receive the directory path + /my_component data_path = path / "data.json" with data_path.open("r", encoding="utf8") as f: - self.data = json.loads(f) + self.data = json.load(f) return self ``` From 3335bb9d0c9df99f20460ed18e07d8844200d7d7 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Thu, 23 Jun 2022 02:15:28 -0400 Subject: [PATCH 08/25] remove `cuda116` extra from install widget (#11012) --- website/src/widgets/quickstart-install.js | 1 - 1 file changed, 1 deletion(-) diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 926d76ae3..ccc6b56d9 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -24,7 +24,6 @@ const CUDA = { '11.3': 'cuda113', '11.4': 'cuda114', '11.5': 'cuda115', - '11.6': 'cuda116', } const LANG_EXTRAS = ['ja'] // only for languages with models From f1197d9175927b453312be633cd789157c17a6e7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jun 2022 08:16:38 +0200 Subject: [PATCH 09/25] Add API docs for token attribute symbols (#10836) * Add API docs for token attribute symbols * Remove NBSP's * Fix typo * Rephrase Co-authored-by: svlandeg --- website/docs/api/attributes.md | 78 ++++++++++++++++++++++++++++++++++ website/meta/sidebars.json | 1 + 2 files changed, 79 insertions(+) create mode 100644 website/docs/api/attributes.md diff --git a/website/docs/api/attributes.md b/website/docs/api/attributes.md new file mode 100644 index 000000000..adacd3898 --- /dev/null +++ b/website/docs/api/attributes.md @@ -0,0 +1,78 @@ +--- +title: Attributes +teaser: Token attributes +source: spacy/attrs.pyx +--- + +[Token](/api/token) attributes are specified using internal IDs in many places +including: + +- [`Matcher` patterns](/api/matcher#patterns), +- [`Doc.to_array`](/api/doc#to_array) and + [`Doc.from_array`](/api/doc#from_array) +- [`Doc.has_annotation`](/api/doc#has_annotation) +- [`MultiHashEmbed`](/api/architectures#MultiHashEmbed) Tok2Vec architecture + `attrs` + +> ```python +> import spacy +> from spacy.attrs import DEP +> +> nlp = spacy.blank("en") +> doc = nlp("There are many attributes.") +> +> # DEP always has the same internal value +> assert DEP == 76 +> +> # "DEP" is automatically converted to DEP +> assert DEP == nlp.vocab.strings["DEP"] +> assert doc.has_annotation(DEP) == doc.has_annotation("DEP") +> +> # look up IDs in spacy.attrs.IDS +> from spacy.attrs import IDS +> assert IDS["DEP"] == DEP +> ``` + +All methods automatically convert between the string version of an ID (`"DEP"`) +and the internal integer symbols (`DEP`). The internal IDs can be imported from +`spacy.attrs` or retrieved from the [`StringStore`](/api/stringstore). A map +from string attribute names to internal attribute IDs is stored in +`spacy.attrs.IDS`. + +The corresponding [`Token` object attributes](/api/token#attributes) can be +accessed using the same names in lowercase, e.g. `token.orth` or `token.length`. +For attributes that represent string values, the internal integer ID is +accessed as `Token.attr`, e.g. 
`token.dep`, while the string value can be +retrieved by appending `_` as in `token.dep_`. + + +| Attribute | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. 
~~str~~ | diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index c23f0a255..1bc395a66 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -124,6 +124,7 @@ { "label": "Other", "items": [ + { "text": "Attributes", "url": "/api/attributes" }, { "text": "Corpus", "url": "/api/corpus" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, From d4e3f43639a963125bad123abe9514a1e6da81fc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Jun 2022 09:50:25 +0200 Subject: [PATCH 10/25] Update thinc version to switch back to blis v0.7 (#11014) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4fea41be2..4e388e54f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0.dev2,<8.2.0", + "thinc>=8.1.0.dev3,<8.2.0", "pathy", "numpy>=1.15.0", ] diff --git a/requirements.txt b/requirements.txt index 082ef1522..3b77140f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.9,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0.dev2,<8.2.0 +thinc>=8.1.0.dev3,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 diff --git a/setup.cfg b/setup.cfg index d317847ba..ba5b46ff0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.0.dev2,<8.2.0 + thinc>=8.1.0.dev3,<8.2.0 install_requires = # Our libraries spacy-legacy>=3.0.9,<3.1.0 @@ -46,7 +46,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0.dev2,<8.2.0 + thinc>=8.1.0.dev3,<8.2.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 From f8116078ce2c5760ae218bc1657977ed116fcf18 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 23 Jun 2022 09:57:46 +0100 Subject: [PATCH 11/25] disable failing test because Stanford servers are down (#11015) --- spacy/tests/training/test_readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index 8c5c81625..eb07a52b1 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -60,11 +60,12 @@ def test_readers(): assert isinstance(extra_corpus, Callable) +# TODO: enable IMDB test once Stanford servers are back up and running @pytest.mark.slow @pytest.mark.parametrize( "reader,additional_config", [ - ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}), + # ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}), ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}), ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}), ], From 4cd8b4cc222bebc2108eb52b4400eea562db4ac2 Mon Sep 17 00:00:00 2001 From: Dmytro Sadovnychyi Date: Thu, 23 Jun 2022 17:53:00 +0200 Subject: [PATCH 12/25] Fix some of the broken links on universe pages (#11011) Currently some of the "AUTHOR INFO" links (e.g. here[0]) are broken: ``` https://github.com/https://github.com/explosion ``` [0] https://spacy.io/universe/project/spacy-experimental Also one remains broken with `https://szegedai.github.io/`. 
--- website/meta/universe.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 4a3ec6225..ab64fe895 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -84,7 +84,7 @@ "code_language": "python", "author": "Leap Beyond", "author_links": { - "github": "https://github.com/LeapBeyond", + "github": "LeapBeyond", "website": "https://leapbeyond.ai" }, "code_example": [ @@ -107,8 +107,8 @@ "code_language": "python", "author": "Peter Baumgartner", "author_links": { - "twitter" : "https://twitter.com/pmbaumgartner", - "github": "https://github.com/pmbaumgartner", + "twitter" : "pmbaumgartner", + "github": "pmbaumgartner", "website": "https://www.peterbaumgartner.com/" }, "code_example": [ @@ -127,8 +127,8 @@ "code_language": "python", "author": "Explosion", "author_links": { - "twitter" : "https://twitter.com/explosion_ai", - "github": "https://github.com/explosion", + "twitter" : "explosion_ai", + "github": "explosion", "website": "https://explosion.ai/" }, "code_example": [ @@ -600,8 +600,8 @@ "code_language": "python", "author": "Keith Rozario", "author_links": { - "twitter" : "https://twitter.com/keithrozario", - "github": "https://github.com/keithrozario", + "twitter" : "keithrozario", + "github": "keithrozario", "website": "https://www.keithrozario.com" }, "code_example": [ @@ -2324,7 +2324,7 @@ "author": "Daniel Whitenack & Chris Benson", "author_links": { "website": "https://changelog.com/practicalai", - "twitter": "https://twitter.com/PracticalAIFM" + "twitter": "PracticalAIFM" }, "category": ["podcasts"] }, From 9738b69c0e3babb365cafaa26b872ca1028c9696 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Fri, 24 Jun 2022 02:11:29 -0400 Subject: [PATCH 13/25] Update Code Conventions.md (#11018) --- extra/DEVELOPER_DOCS/Code Conventions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index 37cd8ff27..31a87d362 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -455,6 +455,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. +### Testing Cython Code + +If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`. + ### Constructing objects and state Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation. 
From bffe54d02b840a73f8dec4d8cd50056507695853 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 24 Jun 2022 08:48:58 +0200 Subject: [PATCH 14/25] Set version to v3.4.0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 03eabc2e9..ef0358e1a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.3.0" +__version__ = "3.4.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From d9320db7db74b970b3751e38ed6f14de5b7d16d5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 1 Apr 2022 10:42:25 +0200 Subject: [PATCH 15/25] Temporarily skip tests that require models/compat --- .github/azure-steps.yml | 34 +++++++++++++++++----------------- spacy/tests/test_cli.py | 2 ++ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d7233328a..41f743feb 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -64,12 +64,12 @@ steps: displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) - - script: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md - python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" - displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.8') +# - script: | +# python -m spacy download ca_core_news_sm +# python -m spacy download ca_core_news_md +# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" +# displayName: 'Test download CLI' +# condition: eq(variables['python_version'], '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . 
@@ -93,17 +93,17 @@ steps: displayName: 'Test train CLI' condition: eq(variables['python_version'], '3.8') - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir - displayName: 'Test assemble CLI' - condition: eq(variables['python_version'], '3.8') - - - script: | - python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" - python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 - displayName: 'Test assemble CLI vectors warning' - condition: eq(variables['python_version'], '3.8') +# - script: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" +# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir +# displayName: 'Test assemble CLI' +# condition: eq(variables['python_version'], '3.8') +# +# - script: | +# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" +# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 +# displayName: 'Test assemble CLI vectors warning' +# condition: eq(variables['python_version'], '3.8') - script: | python .github/validate_universe_json.py website/meta/universe.json diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..fe8b3a8a1 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -589,6 +589,7 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] +@pytest.mark.skip(reason="Temporarily skip for dev version") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -599,6 +600,7 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) +@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 8f1ba4de582c5e5282c022a7713a56b47302cabe Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 24 Jun 2022 13:39:52 +0200 Subject: [PATCH 16/25] Backport parser/alignment optimizations from `feature/refactor-parser` (#10952) --- spacy/training/alignment_array.pyx | 20 +++-- spacy/training/example.pyx | 129 +++++++++++++++++++++++------ spacy/util.py | 7 ++ 3 files changed, 123 insertions(+), 33 deletions(-) diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx index b58f08786..01e9d9bf8 100644 --- a/spacy/training/alignment_array.pyx +++ b/spacy/training/alignment_array.pyx @@ -1,33 +1,39 @@ from typing import List from ..errors import Errors import numpy +from libc.stdint cimport int32_t cdef class AlignmentArray: """AlignmentArray is similar to Thinc's Ragged with two simplfications: indexing returns numpy arrays and this type can only be used for CPU arrays. 
- However, these changes make AlginmentArray more efficient for indexing in a + However, these changes make AlignmentArray more efficient for indexing in a tight loop.""" __slots__ = [] def __init__(self, alignment: List[List[int]]): - self._lengths = None - self._starts_ends = numpy.zeros(len(alignment) + 1, dtype="i") - cdef int data_len = 0 cdef int outer_len cdef int idx + + self._starts_ends = numpy.zeros(len(alignment) + 1, dtype='int32') + cdef int32_t* starts_ends_ptr = self._starts_ends.data + for idx, outer in enumerate(alignment): outer_len = len(outer) - self._starts_ends[idx + 1] = self._starts_ends[idx] + outer_len + starts_ends_ptr[idx + 1] = starts_ends_ptr[idx] + outer_len data_len += outer_len - self._data = numpy.empty(data_len, dtype="i") + self._lengths = None + self._data = numpy.empty(data_len, dtype="int32") + idx = 0 + cdef int32_t* data_ptr = self._data.data + for outer in alignment: for inner in outer: - self._data[idx] = inner + data_ptr[idx] = inner idx += 1 def __getitem__(self, idx): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 045f0b483..473364f93 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans, remove_bilu_prefix from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj from ..tokens.token cimport MISSING_DEP -from ..util import logger, to_ternary_int +from ..util import logger, to_ternary_int, all_equal cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): @@ -151,50 +151,127 @@ cdef class Example: self._y_sig = y_sig return self._cached_alignment + + def _get_aligned_vectorized(self, align, gold_values): + # Fast path for Doc attributes/fields that are predominantly a single value, + # i.e., TAG, POS, MORPH. + x2y_single_toks = [] + x2y_single_toks_i = [] + + x2y_multiple_toks = [] + x2y_multiple_toks_i = [] + + # Gather indices of gold tokens aligned to the candidate tokens into two buckets. + # Bucket 1: All tokens that have a one-to-one alignment. + # Bucket 2: All tokens that have a one-to-many alignment. + for idx, token in enumerate(self.predicted): + aligned_gold_i = align[token.i] + aligned_gold_len = len(aligned_gold_i) + + if aligned_gold_len == 1: + x2y_single_toks.append(aligned_gold_i.item()) + x2y_single_toks_i.append(idx) + elif aligned_gold_len > 1: + x2y_multiple_toks.append(aligned_gold_i) + x2y_multiple_toks_i.append(idx) + + # Map elements of the first bucket directly to the output array. + output = numpy.full(len(self.predicted), None) + output[x2y_single_toks_i] = gold_values[x2y_single_toks].squeeze() + + # Collapse many-to-one alignments into one-to-one alignments if they + # share the same value. Map to None in all other cases. + for i in range(len(x2y_multiple_toks)): + aligned_gold_values = gold_values[x2y_multiple_toks[i]] + + # If all aligned tokens have the same value, use it. + if all_equal(aligned_gold_values): + x2y_multiple_toks[i] = aligned_gold_values[0].item() + else: + x2y_multiple_toks[i] = None + + output[x2y_multiple_toks_i] = x2y_multiple_toks + + return output.tolist() + + + def _get_aligned_non_vectorized(self, align, gold_values): + # Slower path for fields that return multiple values (resulting + # in ragged arrays that cannot be vectorized trivially). 
+ output = [None] * len(self.predicted) + + for token in self.predicted: + aligned_gold_i = align[token.i] + values = gold_values[aligned_gold_i].ravel() + if len(values) == 1: + output[token.i] = values.item() + elif all_equal(values): + # If all aligned tokens have the same value, use it. + output[token.i] = values[0].item() + + return output + + def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" align = self.alignment.x2y + gold_values = self.reference.to_array([field]) + + if len(gold_values.shape) == 1: + output = self._get_aligned_vectorized(align, gold_values) + else: + output = self._get_aligned_non_vectorized(align, gold_values) vocab = self.reference.vocab - gold_values = self.reference.to_array([field]) - output = [None] * len(self.predicted) - for token in self.predicted: - values = gold_values[align[token.i]] - values = values.ravel() - if len(values) == 0: - output[token.i] = None - elif len(values) == 1: - output[token.i] = values[0] - elif len(set(list(values))) == 1: - # If all aligned tokens have the same value, use it. - output[token.i] = values[0] - else: - output[token.i] = None if as_string and field not in ["ENT_IOB", "SENT_START"]: output = [vocab.strings[o] if o is not None else o for o in output] + return output def get_aligned_parse(self, projectivize=True): cand_to_gold = self.alignment.x2y gold_to_cand = self.alignment.y2x - aligned_heads = [None] * self.x.length - aligned_deps = [None] * self.x.length - has_deps = [token.has_dep() for token in self.y] - has_heads = [token.has_head() for token in self.y] heads = [token.head.i for token in self.y] deps = [token.dep_ for token in self.y] + if projectivize: proj_heads, proj_deps = nonproj.projectivize(heads, deps) + has_deps = [token.has_dep() for token in self.y] + has_heads = [token.has_head() for token in self.y] + # ensure that missing data remains missing heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)] deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)] - for cand_i in range(self.x.length): - if cand_to_gold.lengths[cand_i] == 1: - gold_i = cand_to_gold[cand_i][0] - if gold_to_cand.lengths[heads[gold_i]] == 1: - aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]][0]) - aligned_deps[cand_i] = deps[gold_i] - return aligned_heads, aligned_deps + + # Select all candidate tokens that are aligned to a single gold token. + c2g_single_toks = numpy.where(cand_to_gold.lengths == 1)[0] + + # Fetch all aligned gold token incides. + if c2g_single_toks.shape == cand_to_gold.lengths.shape: + # This the most likely case. + gold_i = cand_to_gold[:].squeeze() + else: + gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze() + + # Fetch indices of all gold heads for the aligned gold tokens. + heads = numpy.asarray(heads, dtype='i') + gold_head_i = heads[gold_i] + + # Select all gold tokens that are heads of the previously selected + # gold tokens (and are aligned to a single candidate token). + g2c_len_heads = gold_to_cand.lengths[gold_head_i] + g2c_len_heads = numpy.where(g2c_len_heads == 1)[0] + g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze() + + # Update head/dep alignments with the above. 
+ aligned_heads = numpy.full((self.x.length), None) + aligned_heads[c2g_single_toks[g2c_len_heads]] = g2c_i + + deps = numpy.asarray(deps) + aligned_deps = numpy.full((self.x.length), None) + aligned_deps[c2g_single_toks] = deps[gold_i] + + return aligned_heads.tolist(), aligned_deps.tolist() def get_aligned_sent_starts(self): """Get list of SENT_START attributes aligned to the predicted tokenization. diff --git a/spacy/util.py b/spacy/util.py index 9b871b87b..4f21d618a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1716,3 +1716,10 @@ def packages_distributions() -> Dict[str, List[str]]: for pkg in (dist.read_text("top_level.txt") or "").split(): pkg_to_dist[pkg].append(dist.metadata["Name"]) return dict(pkg_to_dist) + + +def all_equal(iterable): + """Return True if all the elements are equal to each other + (or if the input is an empty sequence), False otherwise.""" + g = itertools.groupby(iterable) + return next(g, True) and not next(g, False) From 4155a59d470c231b5bfca26044a6d4f93bea7e48 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 09:35:35 +0200 Subject: [PATCH 17/25] Auto-format code with black (#11022) Co-authored-by: explosion-bot --- spacy/tests/parser/test_ner.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 53bb2d554..00889efdc 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -158,13 +158,18 @@ def test_issue3209(): def test_labels_from_BILUO(): - """Test that labels are inferred correctly when there's a - in label. - """ + """Test that labels are inferred correctly when there's a - in label.""" nlp = English() ner = nlp.add_pipe("ner") ner.add_label("LARGE-ANIMAL") nlp.initialize() - move_names = ["O", "B-LARGE-ANIMAL", "I-LARGE-ANIMAL", "L-LARGE-ANIMAL", "U-LARGE-ANIMAL"] + move_names = [ + "O", + "B-LARGE-ANIMAL", + "I-LARGE-ANIMAL", + "L-LARGE-ANIMAL", + "U-LARGE-ANIMAL", + ] labels = {"LARGE-ANIMAL"} assert ner.move_names == move_names assert set(ner.labels) == labels From 308a612ec98f27098fe7f69ec20be0b5e88d51fa Mon Sep 17 00:00:00 2001 From: Eric Holscher <25510+ericholscher@users.noreply.github.com> Date: Mon, 27 Jun 2022 00:45:22 -0700 Subject: [PATCH 18/25] Remove `simply` (#11017) I was reading this page, and as a relative beginner, nothing about it was simple :) --- website/docs/api/architectures.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 2bddcb28c..2537faff6 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -587,7 +587,7 @@ consists of either two or three subnetworks: run once for each batch. - **lower**: Construct a feature-specific vector for each `(token, feature)` pair. This is also run once for each batch. Constructing the state - representation is then simply a matter of summing the component features and + representation is then a matter of summing the component features and applying the non-linearity. - **upper** (optional): A feed-forward network that predicts scores from the state representation. If not present, the output from the lower model is used @@ -628,7 +628,7 @@ same signature, but the `use_upper` argument was `True` by default. > ``` Build a tagger model, using a provided token-to-vector component. 
The tagger -model simply adds a linear layer with softmax activation to predict scores given +model adds a linear layer with softmax activation to predict scores given the token vectors. | Name | Description | @@ -920,5 +920,5 @@ A function that reads an existing `KnowledgeBase` from file. A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a named entity, and returns a list of plausible [`Candidate`](/api/kb/#candidate) objects. The default -`CandidateGenerator` simply uses the text of a mention to find its potential +`CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. From 8ffff18ac4e6a1d4fdae76dd7a9ecdf251b149fa Mon Sep 17 00:00:00 2001 From: Zackere Date: Tue, 28 Jun 2022 15:11:15 +0200 Subject: [PATCH 19/25] Try cloning repo from main & master (#10843) * Try cloning repo from main & master * fixup! Try cloning repo from main & master * fixup! fixup! Try cloning repo from main & master * refactor clone and check for repo:branch existence * spacing fix * make mypy happy * type util function * Update spacy/cli/project/clone.py Co-authored-by: Sofie Van Landeghem Co-authored-by: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Co-authored-by: Sofie Van Landeghem --- spacy/cli/_util.py | 17 +++++++++++++++++ spacy/cli/project/clone.py | 30 +++++++++++++++++++++++------- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index bb7f2d352..ae43b991b 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -462,6 +462,23 @@ def git_sparse_checkout(repo, subpath, dest, branch): shutil.move(str(source_path), str(dest)) +def git_repo_branch_exists(repo: str, branch: str) -> bool: + """Uses 'git ls-remote' to check if a repository and branch exists + + repo (str): URL to get repo. + branch (str): Branch on repo to check. + RETURNS (bool): True if repo:branch exists. + """ + get_git_version() + cmd = f"git ls-remote {repo} {branch}" + # We might be tempted to use `--exit-code` with `git ls-remote`, but + # `run_command` handles the `returncode` for us, so we'll rely on + # the fact that stdout returns '' if the requested branch doesn't exist + ret = run_command(cmd, capture=True) + exists = ret.stdout != "" + return exists + + def get_git_version( error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", ) -> Tuple[int, int]: diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index 360ee3428..14b4ed9b5 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -7,11 +7,11 @@ import re from ... import about from ...util import ensure_path from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE -from .._util import git_checkout, get_git_version +from .._util import git_checkout, get_git_version, git_repo_branch_exists DEFAULT_REPO = about.__projects__ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ -DEFAULT_BRANCH = "master" +DEFAULT_BRANCHES = ["main", "master"] @project_cli.command("clone") @@ -20,7 +20,7 @@ def project_clone_cli( name: str = Arg(..., help="The name of the template to clone"), dest: Optional[Path] = Arg(None, help="Where to clone the project. 
Defaults to current working directory", exists=False), repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), - branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"), + branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") # fmt: on ): @@ -33,9 +33,25 @@ def project_clone_cli( """ if dest is None: dest = Path.cwd() / Path(name).parts[-1] + if repo == DEFAULT_REPO and branch is None: + branch = DEFAULT_PROJECTS_BRANCH + if branch is None: - # If it's a user repo, we want to default to other branch - branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH + for default_branch in DEFAULT_BRANCHES: + if git_repo_branch_exists(repo, default_branch): + branch = default_branch + break + if branch is None: + default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) + msg.fail( + "No branch provided and attempted default " + f"branches {default_branches_msg} do not exist.", + exits=1, + ) + else: + if not git_repo_branch_exists(repo, branch): + msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) + assert isinstance(branch, str) project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) @@ -61,9 +77,9 @@ def project_clone( try: git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) except subprocess.CalledProcessError: - err = f"Could not clone '{name}' from repo '{repo_name}'" + err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')" msg.fail(err, exits=1) - msg.good(f"Cloned '{name}' from {repo_name}", project_dir) + msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir) if not (project_dir / PROJECT_FILE).exists(): msg.warn(f"No {PROJECT_FILE} found in directory") else: From a9559e7435f99648aa0004f301692f1a2dfe72fe Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Tue, 28 Jun 2022 15:35:32 +0200 Subject: [PATCH 20/25] Handle Cyrillic combining diacritics (#10837) * Handle Russian, Ukrainian and Bulgarian * Corrections * Correction * Correction to comment * Changes based on review * Correction * Reverted irrelevant change in punctuation.py * Remove unnecessary group * Reverted accidental change --- spacy/lang/bg/__init__.py | 5 +++- spacy/lang/char_classes.py | 4 ++++ spacy/lang/punctuation.py | 22 ++++++++++++++++- spacy/lang/ru/__init__.py | 4 ++++ spacy/lang/uk/__init__.py | 4 ++++ spacy/tests/lang/bg/test_tokenizer.py | 8 +++++++ spacy/tests/lang/ru/test_tokenizer.py | 34 +++++++++++++++++++++++++++ spacy/tests/lang/uk/test_tokenizer.py | 7 ++++++ 8 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/lang/bg/test_tokenizer.py diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 559cc34c4..c9176b946 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS - +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES from ...language import Language, BaseDefaults from ...attrs import LANG from ...util import update_exc @@ -16,6 +17,8 @@ 
class BulgarianDefaults(BaseDefaults): stop_words = STOP_WORDS tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Bulgarian(Language): diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index b15bb3cf3..1d204c46c 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -258,6 +258,10 @@ ALPHA = group_chars( ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) +_combining_diacritics = r"\u0300-\u036f" + +COMBINING_DIACRITICS = _combining_diacritics + _units = ( "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb " diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index e712e71d6..a1cfe6224 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,5 +1,5 @@ from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS +from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT @@ -44,3 +44,23 @@ TOKENIZER_INFIXES = ( r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) + + +# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics +# to mark stressed syllables in words where stress is distinctive. Such languages +# should use the COMBINING_DIACRITICS... suffix and infix regex lists in +# place of the standard ones. +COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [ + r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS), +] + +COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [ + r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS + ), + r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS), + r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format( + a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS + ), + r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS), +] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 5d31d8ea2..c118c26ff 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES from ...language import Language, BaseDefaults @@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Russian(Language): diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 21f9649f2..737243b66 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import UkrainianLemmatizer +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES +from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES 
from ...language import Language, BaseDefaults @@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS + suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES + infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES class Ukrainian(Language): diff --git a/spacy/tests/lang/bg/test_tokenizer.py b/spacy/tests/lang/bg/test_tokenizer.py new file mode 100644 index 000000000..2e2c45001 --- /dev/null +++ b/spacy/tests/lang/bg/test_tokenizer.py @@ -0,0 +1,8 @@ +import pytest + + +def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): + text = "Ня̀маше яйца̀. Ня̀маше яйца̀." + tokens = bg_tokenizer(text) + assert tokens[1].text == "яйца̀" + assert tokens[2].text == "." diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 1cfdc50ee..083b55a09 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -1,3 +1,4 @@ +from string import punctuation import pytest @@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer): text = "(Раз, два, три, проверка)." tokens = ru_tokenizer(text) assert tokens[len(tokens) - 1].text == "." + + +@pytest.mark.parametrize( + "text", + [ + "рекоменду́я подда́ть жару́. Самого́ Баргамота", + "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА", + "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота", + "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота", + ], +) +def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍") + assert tokens[3].text in punctuation + + +@pytest.mark.parametrize( + "text", + [ + "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА", + "рекоменду̍я подда̍ть жару́.самого́ Баргамота", + ], +) +def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text): + tokens = ru_tokenizer(text) + assert tokens[2].text.lower() == "жару́.самого́" diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index 3d6e87301..6596f490a 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer): text = "(Раз, два, три, проверка)." tokens = uk_tokenizer(text) assert tokens[len(tokens) - 1].text == "." + + +def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer): + text = "Хлібі́в не було́. Хлібі́в не було́." + tokens = uk_tokenizer(text) + assert tokens[2].text == "було́" + assert tokens[3].text == "." 
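
A quick sketch of what the new combining-diacritics rules change in practice, mirroring the Russian tokenizer test added in this commit. It is an illustration only, not part of the patch series: it assumes a spaCy build that already includes this change, and a blank `ru` pipeline is enough here because only the tokenizer is exercised and no trained model is loaded.

```python
import spacy

# Blank Russian pipeline: only the tokenizer, which now uses the
# COMBINING_DIACRITICS_TOKENIZER_SUFFIXES/INFIXES rules, is involved.
nlp = spacy.blank("ru")

# The stressed vowels carry a combining acute accent (U+0301). With the
# rules from this commit, the final "жару́." is split into a word token
# plus a trailing period instead of staying glued together.
doc = nlp("рекоменду́я подда́ть жару́. Самого́ Баргамота")

print([t.text for t in doc])
assert doc[2].text == "жару́"
assert doc[3].text == "."
```

Ukrainian and Bulgarian are wired up to the same suffix and infix lists through their `Defaults`, which is why the commit adds parallel `uk` and `bg` tokenizer tests.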
From 1d5cad0b42c5919dde27a59808ff97f8e15cfaa0 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 28 Jun 2022 19:42:58 +0200 Subject: [PATCH 21/25] `Example.get_aligned_parse`: Handle unit and zero length vectors correctly (#11026) * `Example.get_aligned_parse`: Do not squeeze gold token idx vector Correctly handle zero-size vectors passed to `np.vectorize` * Add tests * Use `Doc` ctor to initialize attributes * Remove unintended change Co-authored-by: Adriane Boyd * Remove unused import Co-authored-by: Adriane Boyd --- spacy/tests/training/test_training.py | 25 +++++++++++++++++++++++++ spacy/training/example.pyx | 6 +++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 31bf7e07b..4384a796d 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer): assert proj_heads == [3, 2, 3, 3, 3] assert nonproj_heads == [3, 2, 3, 3, 2] + # Test single token documents + doc = en_tokenizer("Conrail") + heads = [0] + deps = ["dep"] + example = Example.from_dict(doc, {"heads": heads, "deps": deps}) + proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) + assert proj_heads == heads + assert proj_labels == deps + + # Test documents with no alignments + doc_a = Doc( + doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0] + ) + doc_b = Doc( + doc.vocab, + words=["Double", "-", "Jointed"], + spaces=[True, True, True], + deps=["amod", "punct", "ROOT"], + heads=[2, 2, 2], + ) + example = Example(doc_a, doc_b) + proj_heads, proj_deps = example.get_aligned_parse(projectivize=True) + assert proj_heads == [None] + assert proj_deps == [None] + def test_iob_to_biluo(): good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 473364f93..d592e5a52 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -249,9 +249,9 @@ cdef class Example: # Fetch all aligned gold token incides. if c2g_single_toks.shape == cand_to_gold.lengths.shape: # This the most likely case. - gold_i = cand_to_gold[:].squeeze() + gold_i = cand_to_gold[:] else: - gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze() + gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks) # Fetch indices of all gold heads for the aligned gold tokens. heads = numpy.asarray(heads, dtype='i') @@ -261,7 +261,7 @@ cdef class Example: # gold tokens (and are aligned to a single candidate token). g2c_len_heads = gold_to_cand.lengths[gold_head_i] g2c_len_heads = numpy.where(g2c_len_heads == 1)[0] - g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze() + g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze() # Update head/dep alignments with the above. aligned_heads = numpy.full((self.x.length), None) From 24f4908fce4740130fc5355f28e9aa87cadd9817 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Jun 2022 19:50:47 +0200 Subject: [PATCH 22/25] Update vector handling in similarity methods (#11013) Distinguish between vectors that are 0 vs. missing vectors when warning about missing vectors. Update `Doc.has_vector` to match `Span.has_vector` and `Token.has_vector` for cases where the vocab has vectors but none of the tokens in the container have vectors. 
--- spacy/tests/vocab_vectors/test_similarity.py | 33 +++++++++++++++----- spacy/tests/vocab_vectors/test_vectors.py | 10 +++--- spacy/tokens/doc.pyx | 5 +-- spacy/tokens/span.pyx | 3 +- spacy/tokens/token.pyx | 3 +- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 47cd1f060..1efcdd81e 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -1,6 +1,7 @@ import pytest import numpy from spacy.tokens import Doc +from spacy.vocab import Vocab from ..util import get_cosine, add_vecs_to_vocab @@ -71,19 +72,17 @@ def test_vectors_similarity_DD(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert isinstance(doc.similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc), float) - assert doc.similarity(doc[0]) == doc[0].similarity(doc) + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) + assert doc.similarity(doc[0]) == doc[0].similarity(doc) def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) - with pytest.warns(UserWarning): - assert isinstance(doc[:2].similarity(doc[0]), float) - assert isinstance(doc[0].similarity(doc[-2]), float) - assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[:2]), float) + assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) def test_vectors_similarity_DS(vocab, vectors): @@ -91,3 +90,21 @@ def test_vectors_similarity_DS(vocab, vectors): doc = Doc(vocab, words=[word1, word2]) assert isinstance(doc.similarity(doc[:2]), float) assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) + + +def test_vectors_similarity_no_vectors(): + vocab = Vocab() + doc1 = Doc(vocab, words=["a", "b"]) + doc2 = Doc(vocab, words=["c", "d", "e"]) + with pytest.warns(UserWarning): + doc1.similarity(doc2) + with pytest.warns(UserWarning): + doc1.similarity(doc2[1]) + with pytest.warns(UserWarning): + doc1.similarity(doc2[:2]) + with pytest.warns(UserWarning): + doc2.similarity(doc1) + with pytest.warns(UserWarning): + doc2[1].similarity(doc1) + with pytest.warns(UserWarning): + doc2[:2].similarity(doc1) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index e3ad206f4..dd2cfc596 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -318,17 +318,15 @@ def test_vectors_lexeme_doc_similarity(vocab, text): @pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) def test_vectors_span_span_similarity(vocab, text): doc = Doc(vocab, words=text) - with pytest.warns(UserWarning): - assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) - assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0 + assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) + assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0 @pytest.mark.parametrize("text", [["apple", "orange", "juice"]]) def test_vectors_span_doc_similarity(vocab, text): doc = Doc(vocab, words=text) - with pytest.warns(UserWarning): - assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) - assert -1.0 < doc[0:2].similarity(doc) < 1.0 + assert doc[0:2].similarity(doc) == 
doc.similarity(doc[0:2]) + assert -1.0 < doc[0:2].similarity(doc) < 1.0 @pytest.mark.parametrize( diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e38de02b4..d9a104ac8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -607,7 +607,8 @@ cdef class Doc: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - warnings.warn(Warnings.W008.format(obj="Doc")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -627,7 +628,7 @@ cdef class Doc: if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) elif self.vocab.vectors.size: - return True + return any(token.has_vector for token in self) elif self.tensor.size: return True else: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index ab888ae95..c3495f497 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -354,7 +354,8 @@ cdef class Span: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - warnings.warn(Warnings.W008.format(obj="Span")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index d14930348..7fff6b162 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -206,7 +206,8 @@ cdef class Token: if self.vocab.vectors.n_keys == 0: warnings.warn(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - warnings.warn(Warnings.W008.format(obj="Token")) + if not self.has_vector or not other.has_vector: + warnings.warn(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) From dd038b536cf632408080d9a88f3bc4bf2ffdefe4 Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Tue, 28 Jun 2022 14:42:40 -0400 Subject: [PATCH 23/25] fix to horizontal space (#10994) --- spacy/displacy/render.py | 19 +++++++++++++++---- spacy/displacy/templates.py | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 247ad996b..a730ce522 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -64,8 +64,11 @@ class SpanRenderer: # Set up how the text and labels will be rendered self.direction = DEFAULT_DIR self.lang = DEFAULT_LANG + # These values are in px self.top_offset = options.get("top_offset", 40) - self.top_offset_step = options.get("top_offset_step", 17) + # This is how far under the top offset the span labels appear + self.span_label_offset = options.get("span_label_offset", 20) + self.offset_step = options.get("top_offset_step", 17) # Set up which templates will be used template = options.get("template") @@ -161,8 +164,16 @@ class SpanRenderer: if entities: slices = self._get_span_slices(token["entities"]) starts = self._get_span_starts(token["entities"]) + total_height = ( + self.top_offset + + self.span_label_offset + + (self.offset_step * (len(entities) - 1)) + ) markup += self.span_template.format( - text=token["text"], span_slices=slices, span_starts=starts + text=token["text"], + span_slices=slices, + span_starts=starts, + total_height=total_height, ) else: markup += escape_html(token["text"] + " ") @@ -171,7 +182,7 @@ 
class SpanRenderer: def _get_span_slices(self, entities: List[Dict]) -> str: """Get the rendered markup of all Span slices""" span_slices = [] - for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + for entity, step in zip(entities, itertools.count(step=self.offset_step)): color = self.colors.get(entity["label"].upper(), self.default_color) span_slice = self.span_slice_template.format( bg=color, top_offset=self.top_offset + step @@ -182,7 +193,7 @@ class SpanRenderer: def _get_span_starts(self, entities: List[Dict]) -> str: """Get the rendered markup of all Span start tokens""" span_starts = [] - for entity, step in zip(entities, itertools.count(step=self.top_offset_step)): + for entity, step in zip(entities, itertools.count(step=self.offset_step)): color = self.colors.get(entity["label"].upper(), self.default_color) span_start = ( self.span_start_template.format( diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ff81e7a1d..40f5376b1 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -67,7 +67,7 @@ TPL_SPANS = """ """ TPL_SPAN = """ - + {text} {span_slices} {span_starts} From 0ff14aabcecef1003fa3cb6fb6227041bb0df73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 29 Jun 2022 12:58:31 +0200 Subject: [PATCH 24/25] vectors: avoid expensive comparisons between numpy ints and Python ints (#10992) * vectors: avoid expensive comparisons between numpy ints and Python ints * vectors: avoid failure on lists of ints * Convert another numpy int to Python --- spacy/vectors.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 93f6818ee..8300220c1 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -336,10 +336,10 @@ cdef class Vectors: xp = get_array_module(self.data) if key is not None: key = get_string_id(key) - return self.key2row.get(key, -1) + return self.key2row.get(int(key), -1) elif keys is not None: keys = [get_string_id(key) for key in keys] - rows = [self.key2row.get(key, -1) for key in keys] + rows = [self.key2row.get(int(key), -1) for key in keys] return xp.asarray(rows, dtype="i") else: row2key = {row: key for key, row in self.key2row.items()} From 4581a4f53f77114cb074d2a76a62068154fa8211 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 29 Jun 2022 20:03:36 +0200 Subject: [PATCH 25/25] Run mypy for python 3.10 (#11052) --- .github/azure-steps.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 41f743feb..1f886161a 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -27,7 +27,6 @@ steps: - script: python -m mypy spacy displayName: 'Run mypy' - condition: ne(variables['python_version'], '3.10') - task: DeleteFiles@1 inputs:
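A hedged usage sketch for the displacy fix above (#10994): the span visualizer now derives each token's total height from `top_offset`, the new `span_label_offset`, and `top_offset_step`, all in px. The option keys below mirror what `SpanRenderer` reads; the example text and spans are invented:

    import spacy
    from spacy import displacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China")
    # Overlapping spans stored under the default "sc" key
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
    html = displacy.render(
        doc,
        style="span",
        options={"top_offset": 40, "span_label_offset": 20, "top_offset_step": 17},
    )
    # total height = top_offset + span_label_offset
    #                + top_offset_step * (number of overlapping spans - 1)
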
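For the vectors patch above (#10992), the motivation is that plain-dict lookups keyed by NumPy integers go through NumPy's slower hashing and comparison path, so converting the key to a built-in int once is cheaper. A rough, environment-dependent sketch of the effect:

    import timeit
    import numpy

    key2row = {i: i for i in range(100_000)}
    np_key = numpy.uint64(4242)

    numpy_lookup = timeit.timeit(lambda: key2row.get(np_key, -1), number=200_000)
    int_lookup = timeit.timeit(lambda: key2row.get(int(np_key), -1), number=200_000)
    # On most setups the second variant is faster, even though it pays for the
    # int() conversion inside the timed call; exact numbers vary by platform.
    print(f"numpy key: {numpy_lookup:.3f}s  int key: {int_lookup:.3f}s")
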