From f22fc7a1138545a2a75975909b5af554e8e1d616 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 9 Dec 2022 10:15:52 +0100 Subject: [PATCH 1/6] Auto-format code with black (#11955) Co-authored-by: explosion-bot --- spacy/tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3104b49ff..42af08749 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -140,7 +140,7 @@ def test_issue11235(): assert os.path.exists(d / "cfg") assert os.path.exists(d / f"{lang_var}_model") assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" - + def test_cli_info(): nlp = Dutch() From 8c291ace0c0978e70257906438d3585022090e9f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 08:38:36 +0100 Subject: [PATCH 2/6] Extend to wasabi v1.1 (#11945) * Extend to wasabi v1.1 * Temporarily run mypy and tests with newest wasabi * Temporarily skip check requirements test * Revert "Temporarily skip check requirements test" This reverts commit 44f4ce20a8e8c92e8bfc8042cc68333589a96253. * Revert "Temporarily run mypy and tests with newest wasabi" This reverts commit e677a2257ced55e696cafc3a8e89eb2f7ddfc369. --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 778c05e21..0440835f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0 thinc>=8.1.0,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.9.1,<1.1.0 +wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.8.0 diff --git a/setup.cfg b/setup.cfg index 5768c9d3e..cf6e6f84b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.1.0,<8.2.0 - wasabi>=0.9.1,<1.1.0 + wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 # Third-party dependencies From 0591e67265d7378769c0fc0df4020817f2d514ec Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 08:45:35 +0100 Subject: [PATCH 3/6] Cast to uint64 for all array-based doc representations (#11933) * Convert all individual values explicitly to uint64 for array-based doc representations * Temporarily test with latest numpy v1.24.0rc * Remove unnecessary conversion from attr_t * Reduce number of individual casts * Convert specifically from int32 to uint64 * Revert "Temporarily test with latest numpy v1.24.0rc" This reverts commit eb0e3c5006515b9a7ff52bae59484c909b8a3f65. * Also use int32 in tests --- spacy/tests/doc/test_array.py | 4 ++-- spacy/tokens/doc.pyx | 2 ++ spacy/tokens/span.pyx | 4 ++-- spacy/training/example.pyx | 15 ++++++++------- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index c334cc6eb..1f2d7d999 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab): # head before start arr = doc.to_array(["HEAD"]) - arr[0] = -1 + arr[0] = numpy.int32(-1).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) # head after end arr = doc.to_array(["HEAD"]) - arr[0] = 5 + arr[0] = numpy.int32(5).astype(numpy.uint64) doc_from_array = Doc(en_vocab, words=words) with pytest.raises(ValueError): doc_from_array.from_array(["HEAD"], arr) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f2621292c..075bc4d15 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -359,6 +359,7 @@ cdef class Doc: for annot in annotations: if annot: if annot is heads or annot is sent_starts or annot is ent_iobs: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): if attrs.ndim == 1: attrs[i] = annot[i] @@ -1558,6 +1559,7 @@ cdef class Doc: for j, (attr, annot) in enumerate(token_annotations.items()): if attr is HEAD: + annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) for i in range(len(words)): array[i, j] = annot[i] elif attr is MORPH: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index c3495f497..99a5f43bd 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -299,7 +299,7 @@ cdef class Span: for ancestor in ancestors: ancestor_i = ancestor.i - self.c.start if ancestor_i in range(length): - array[i, head_col] = ancestor_i - i + array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64) # if there is no appropriate ancestor, define a new artificial root value = array[i, head_col] @@ -307,7 +307,7 @@ cdef class Span: new_root = old_to_new_root.get(ancestor_i, None) if new_root is not None: # take the same artificial root as a previous token from the same sentence - array[i, head_col] = new_root - i + array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64) else: # set this token as the new artificial root array[i, head_col] = 0 diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index dfd337b9e..95b0f0de9 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot): if key not in IDS: raise ValueError(Errors.E974.format(obj="token", key=key)) elif key in ["ORTH", "SPACY"]: - pass + continue elif key == "HEAD": attrs.append(key) - values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) + row = [h-i if h is not None else 0 for i, h in enumerate(value)] elif key == "DEP": attrs.append(key) - values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) + row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value] elif key == "SENT_START": attrs.append(key) - values.append([to_ternary_int(v) for v in value]) + row = [to_ternary_int(v) for v in value] elif key == "MORPH": attrs.append(key) - values.append([vocab.morphology.add(v) for v in value]) + row = [vocab.morphology.add(v) for v in value] else: attrs.append(key) if not all(isinstance(v, str) for v in value): types = set([type(v) for v in value]) raise TypeError(Errors.E969.format(field=key, types=types)) from None - values.append([vocab.strings.add(v) for v in value]) - array = numpy.asarray(values, dtype="uint64") + row = [vocab.strings.add(v) for v in value] + values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row]) + array = numpy.array(values, dtype=numpy.uint64) return attrs, array.T From e5c7f3b0776d49c4f6aab7e02b503cdb84fb2134 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 12 Dec 2022 10:13:10 +0100 Subject: [PATCH 4/6] CI: Install thinc-apple-ops through extra (#11963) --- .github/azure-steps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 2f77706b8..d0db75f9a 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -107,7 +107,7 @@ steps: displayName: "Run CPU tests" - script: | - python -m pip install --pre thinc-apple-ops + python -m pip install 'spacy[apple]' python -m pytest --pyargs spacy displayName: "Run CPU tests with thinc-apple-ops" condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) From c9d9d6847f9685c21eeec01f4b8cd053cadf8bf5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 15 Dec 2022 10:55:01 +0100 Subject: [PATCH 5/6] Update build constraints for python 3.11 (#11981) --- build-constraints.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build-constraints.txt b/build-constraints.txt index 956973abf..c1e82f1b0 100644 --- a/build-constraints.txt +++ b/build-constraints.txt @@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' numpy==1.19.3; python_version=='3.9' numpy==1.21.3; python_version=='3.10' -numpy; python_version>='3.11' +numpy==1.23.2; python_version=='3.11' +numpy; python_version>='3.12' From 3a2b655a29203d1c181a2c14d230b3f9cf8dd54a Mon Sep 17 00:00:00 2001 From: cfuerbachersparks <119413757+cfuerbachersparks@users.noreply.github.com> Date: Mon, 19 Dec 2022 10:33:38 +0100 Subject: [PATCH 6/6] Update lexeme.md (#11994) Change suffix_ string to end --- website/docs/api/lexeme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index eb76afa90..557d04cce 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation. | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ | | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ | | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ | -| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ | +| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ | | `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ | | `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ | | `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |