From 2fa983aa2e746bbd71ac9935483ab99c6322d85e Mon Sep 17 00:00:00 2001 From: Nicolai Bjerre Pedersen Date: Tue, 12 Jul 2022 13:47:35 +0200 Subject: [PATCH 01/13] Fix span typings (#11119) Add id, id_ to span.pyi. --- spacy/tokens/span.pyi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 4a4149652..617e3d19d 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -120,6 +120,10 @@ class Span: ent_id: int ent_id_: str @property + def id(self) -> int: ... + @property + def id_(self) -> str: ... + @property def orth_(self) -> str: ... @property def lemma_(self) -> str: ... From 2235e3520c763fd3e25118e6cc104def3f75330f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 12 Jul 2022 15:20:33 +0200 Subject: [PATCH 02/13] Update binder version in docs (#11124) --- website/meta/site.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/meta/site.json b/website/meta/site.json index 97051011f..360a72178 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -28,7 +28,7 @@ }, "binderUrl": "explosion/spacy-io-binder", "binderBranch": "spacy.io", - "binderVersion": "3.0", + "binderVersion": "3.4", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, From 1caa2d1d16babb43b346e3eebcf229367bcc47f5 Mon Sep 17 00:00:00 2001 From: Maarten Grootendorst Date: Tue, 19 Jul 2022 12:37:18 +0200 Subject: [PATCH 03/13] Added BERTopic to Spacy Universe (#11159) * Added BERTopic to Spacy Universe * Fix no render of visualization --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 29d436ec4..53cc53024 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -544,6 +544,37 @@ "website": "https://koaning.io" } }, + { + "id": "bertopic", + "title": "BERTopic", + "slogan": "Leveraging BERT and c-TF-IDF to create easily interpretable topics.", + "description": "BERTopic is a topic modeling technique that leverages embedding models and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions. BERTopic supports guided, (semi-) supervised, hierarchical, and dynamic topic modeling.", + "github": "maartengr/bertopic", + "pip": "bertopic", + "thumb": "https://i.imgur.com/Rx2LfBm.png", + "image": "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/topic_visualization.gif", + "code_example": [ + "import spacy", + "from bertopic import BERTopic", + "from sklearn.datasets import fetch_20newsgroups", + "", + "docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']", + "nlp = spacy.load('en_core_web_md', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])", + "", + "topic_model = BERTopic(embedding_model=nlp)", + "topics, probs = topic_model.fit_transform(docs)", + "", + "fig = topic_model.visualize_topics()", + "fig.show()" + ], + "category": ["visualizers", "training"], + "author": "Maarten Grootendorst", + "author_links": { + "twitter": "maartengr", + "github": "maartengr", + "website": "https://maartengrootendorst.com" + } + }, { "id": "tokenwiser", "title": "tokenwiser", From 7ff52c02a11ba80128e55a98b3213d6c9f5aa80a Mon Sep 17 00:00:00 2001 From: Lucas Terriel <44713216+Lucaterre@users.noreply.github.com> Date: Sun, 24 Jul 2022 10:10:29 +0200 Subject: [PATCH 04/13] Update meta for spacyfishing in spaCy Universe (#11185) * add new logo for spacyfishing to update spacy universe * change logo location --- website/meta/universe.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 53cc53024..6a981e9f0 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -22,6 +22,7 @@ "## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids" ], "category": ["models", "pipeline"], + "image": "https://raw.githubusercontent.com/Lucaterre/spacyfishing/main/docs/spacyfishing-logo-resized.png", "tags": ["NER", "NEL"], "author": "Lucas Terriel", "author_links": { From a5aa3a818fba61cffa7b5738ec24a03700f18468 Mon Sep 17 00:00:00 2001 From: Dan Radenkovic Date: Sun, 24 Jul 2022 10:16:36 +0200 Subject: [PATCH 05/13] fix docs (#11123) --- website/docs/api/matcher.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index ab88c4194..8cc446c6a 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -199,7 +199,7 @@ will be overwritten. > [{"LOWER": "hello"}, {"LOWER": "world"}], > [{"ORTH": "Google"}, {"ORTH": "Maps"}] > ] -> matcher.add("TEST_PATTERNS", patterns) +> matcher.add("TEST_PATTERNS", patterns, on_match=on_match) > doc = nlp("HELLO WORLD on Google Maps.") > matches = matcher(doc) > ``` From 93960dc4b59510b011c12079fbba09eb8219f74e Mon Sep 17 00:00:00 2001 From: 0xpeIpeI <63499912+lll-lll-lll-lll@users.noreply.github.com> Date: Sun, 24 Jul 2022 19:01:04 +0900 Subject: [PATCH 06/13] [universe project] create English interpretation project (#11184) * [add] my universe project setting * [modify] A few adjustments * [Modify] change package description --- website/meta/universe.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6a981e9f0..3c8afbd9a 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4023,6 +4023,37 @@ "description": "Episodes about spaCy or interviews with the spaCy team" } ] + }, + { + "id": "sent-pattern", + "title": "English Interpretation Sentence Pattern", + "slogan": "English interpretation for accurate translation from English to Japanese", + "description": "This package categorizes English sentences into one of five basic sentence patterns and identifies the subject, verb, object, and other components. The five basic sentence patterns are based on C. T. Onions's Advanced English Syntax and are frequently used when teaching English in Japan.", + "github": "lll-lll-lll-lll/sent-pattern", + "pip": "sent-pattern", + "code_example": [ + "import spacy", + "nlp = spacy.load('en_core_web_lg')", + "", + "nlp.add_pipe('sent_pattern')", + "text = 'he gives me something'", + "pattern = doc._.sentpattern", + "", + "print(pattern)", + "# FourthSentencePattern (class)", + "print(pattern.subject.root)", + "# he (Token)", + "print(pattern.verb.root)", + "# give (Token)" + ], + "code_language": "python", + "author": "Shunpei Nakayama", + "author_links": { + "twitter": "ExZ79575296", + "github": "lll-lll-lll-lll" + }, + "category": ["pipeline"], + "tags": ["interpretation", "ja"] } ] } From 7a99fe3c65074eb70bfac96d1f0c83cbdb7ec2c7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 Jul 2022 09:14:50 +0200 Subject: [PATCH 07/13] Move sent-patterns to correct section of universe.json (#11192) --- website/meta/universe.json | 46 +++++++++++++------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 3c8afbd9a..a128f0795 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3935,6 +3935,21 @@ }, "category": ["biomedical", "scientific", "research", "pipeline"], "tags": ["clinical"] + }, + { + "id": "sent-pattern", + "title": "English Interpretation Sentence Pattern", + "slogan": "English interpretation for accurate translation from English to Japanese", + "description": "This package categorizes English sentences into one of five basic sentence patterns and identifies the subject, verb, object, and other components. The five basic sentence patterns are based on C. T. Onions's Advanced English Syntax and are frequently used when teaching English in Japan.", + "github": "lll-lll-lll-lll/sent-pattern", + "pip": "sent-pattern", + "author": "Shunpei Nakayama", + "author_links": { + "twitter": "ExZ79575296", + "github": "lll-lll-lll-lll" + }, + "category": ["pipeline"], + "tags": ["interpretation", "ja"] } ], @@ -4023,37 +4038,6 @@ "description": "Episodes about spaCy or interviews with the spaCy team" } ] - }, - { - "id": "sent-pattern", - "title": "English Interpretation Sentence Pattern", - "slogan": "English interpretation for accurate translation from English to Japanese", - "description": "This package categorizes English sentences into one of five basic sentence patterns and identifies the subject, verb, object, and other components. The five basic sentence patterns are based on C. T. Onions's Advanced English Syntax and are frequently used when teaching English in Japan.", - "github": "lll-lll-lll-lll/sent-pattern", - "pip": "sent-pattern", - "code_example": [ - "import spacy", - "nlp = spacy.load('en_core_web_lg')", - "", - "nlp.add_pipe('sent_pattern')", - "text = 'he gives me something'", - "pattern = doc._.sentpattern", - "", - "print(pattern)", - "# FourthSentencePattern (class)", - "print(pattern.subject.root)", - "# he (Token)", - "print(pattern.verb.root)", - "# give (Token)" - ], - "code_language": "python", - "author": "Shunpei Nakayama", - "author_links": { - "twitter": "ExZ79575296", - "github": "lll-lll-lll-lll" - }, - "category": ["pipeline"], - "tags": ["interpretation", "ja"] } ] } From 1c12812d1a218f505ccfcd4d958f88ab895ed83e Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 25 Jul 2022 16:39:34 +0900 Subject: [PATCH 08/13] Replace link to old label (#11188) --- website/src/templates/universe.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index 10f2520d9..48ffa3add 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -142,10 +142,10 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp The Universe database is open-source and collected in a simple JSON file. For more details on the formats and available fields, see the documentation. Looking for inspiration your own spaCy plugin or extension? Check out the - - project idea + + project idea - label on the issue tracker. + section in Discussions.

From e5990db71358a4d5f3ad146faf6b33b87d0c231f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 25 Jul 2022 18:12:18 +0200 Subject: [PATCH 09/13] Revert "Temporarily skip tests that require models/compat" This reverts commit d9320db7db74b970b3751e38ed6f14de5b7d16d5. --- .github/azure-steps.yml | 34 +++++++++++++++++----------------- spacy/tests/test_cli.py | 2 -- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 5d865b452..aae08c7f3 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -63,12 +63,12 @@ steps: displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) -# - script: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# displayName: 'Test download CLI' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . @@ -92,17 +92,17 @@ steps: displayName: 'Test train CLI' condition: eq(variables['python_version'], '3.8') -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# displayName: 'Test assemble CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# displayName: 'Test assemble CLI vectors warning' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: 'Test assemble CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: 'Test assemble CLI vectors warning' + condition: eq(variables['python_version'], '3.8') - script: | python .github/validate_universe_json.py website/meta/universe.json diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index fe8b3a8a1..838e00369 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -589,7 +589,6 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -600,7 +599,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False From 4ee8a061497ed24ded0fdcaf9b89ba4b28f49e96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 26 Jul 2022 10:52:01 +0200 Subject: [PATCH 10/13] Fix compatibility with CuPy 9.x (#11194) After the precomputable affine table of shape [nB, nF, nO, nP] is computed, padding with shape [1, nF, nO, nP] is assigned to the first row of the precomputed affine table. However, when we are indexing the precomputed table, we get a row of shape [nF, nO, nP]. CuPy versions before 10.0 cannot paper over this shape difference. This change fixes compatibility with CuPy < 10.0 by squeezing the first dimension of the padding before assignment. --- spacy/ml/_precomputable_affine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index 7a25e7574..1c20c622b 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -26,7 +26,11 @@ def forward(model, X, is_train): Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False) model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:]) Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) - Yf[0] = model.get_param("pad") + + # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot + # change its shape to (nF, nO, nP) without breaking existing models. So + # we'll squeeze the first dimension here. + Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0) def backward(dY_ids): # This backprop is particularly tricky, because we get back a different From c8f5b752bb00e4d83a92e4919ec2688d47b9aada Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 Jul 2022 10:56:53 +0200 Subject: [PATCH 11/13] Add link to developer docs code conventions (#11171) --- CONTRIBUTING.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ddd833be1..1f396bd71 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -271,7 +271,8 @@ except: # noqa: E722 ### Python conventions -All Python code must be written **compatible with Python 3.6+**. +All Python code must be written **compatible with Python 3.6+**. More detailed +code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md). #### I/O and handling paths From 5c2a00cef04b8c6e93e81cd1ca1d752f320c6e5d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 Jul 2022 12:52:38 +0200 Subject: [PATCH 12/13] Set version to v3.4.1 (#11209) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ef0358e1a..843c15aba 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.4.0" +__version__ = "3.4.1" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 360a702ecdf468bcdc7e14906d09cdfe1860e764 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Tue, 26 Jul 2022 14:35:18 +0200 Subject: [PATCH 13/13] Add parent argument (#11210) --- spacy/cli/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index fe3ce0dad..381d589cf 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -61,7 +61,7 @@ def pretrain_cli( # TODO: What's the solution here? How do we handle optional blocks? msg.fail("The [pretraining] block in your config is empty", exits=1) if not output_dir.exists(): - output_dir.mkdir() + output_dir.mkdir(parents=True) msg.good(f"Created output directory: {output_dir}") # Save non-interpolated config raw_config.to_disk(output_dir / "config.cfg")