Merge remote-tracking branch 'upstream/master' into rapidfuzz

Adriane Boyd 2022-12-19 15:46:17 +01:00
commit bac3a08023
10 changed files with 21 additions and 17 deletions

View File

@@ -107,7 +107,7 @@ steps:
     displayName: "Run CPU tests"
   - script: |
-      python -m pip install --pre thinc-apple-ops
+      python -m pip install 'spacy[apple]'
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

View File

@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'

View File

@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=8.1.0,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.8.0

View File

@@ -47,7 +47,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.1.0,<8.2.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies

View File

@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     # head before start
     arr = doc.to_array(["HEAD"])
-    arr[0] = -1
+    arr[0] = numpy.int32(-1).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
     # head after end
     arr = doc.to_array(["HEAD"])
-    arr[0] = 5
+    arr[0] = numpy.int32(5).astype(numpy.uint64)
     doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
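The pre-cast in this test sidesteps NumPy 1.24's deprecation of assigning out-of-range Python integers to unsigned arrays: `to_array` returns a `uint64` array, so a bare `-1` no longer fits. A minimal sketch of the trick, assuming only NumPy (the array stands in for the test's `arr`):

import numpy

# Assigning a bare -1 into a uint64 array is deprecated in NumPy 1.24
# and slated to become an error; routing the value through int32 stores
# the two's-complement bit pattern instead:
arr = numpy.zeros(3, dtype=numpy.uint64)
arr[0] = numpy.int32(-1).astype(numpy.uint64)  # 18446744073709551615
# Casting back down recovers the signed relative head offset:
assert arr.astype(numpy.int32)[0] == -1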

View File

@@ -359,6 +359,7 @@ cdef class Doc:
         for annot in annotations:
             if annot:
                 if annot is heads or annot is sent_starts or annot is ent_iobs:
+                    annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     if attrs.ndim == 1:
                         attrs[i] = annot[i]
@@ -1558,6 +1559,7 @@ cdef class Doc:
         for j, (attr, annot) in enumerate(token_annotations.items()):
             if attr is HEAD:
+                annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
                 for i in range(len(words)):
                     array[i, j] = annot[i]
             elif attr is MORPH:
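Both hunks apply the same conversion to whole annotation lists before the values are copied into the `uint64` attribute array: `heads`, `sent_starts`, and `ent_iobs` can hold negative entries (left-pointing head offsets, the -1 of ternary sentence starts). A rough self-contained sketch with made-up data:

import numpy

heads = [1, -1, 0]  # hypothetical relative head offsets; -1 points left
annot = numpy.array(heads, dtype=numpy.int32).astype(numpy.uint64)
attrs = numpy.zeros(len(heads), dtype=numpy.uint64)
for i in range(len(heads)):
    attrs[i] = annot[i]  # safe: both sides are already uint64
# The signed values survive the round trip through the unsigned array:
assert attrs.astype(numpy.int32).tolist() == heads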

View File

@@ -299,7 +299,7 @@ cdef class Span:
                 for ancestor in ancestors:
                     ancestor_i = ancestor.i - self.c.start
                     if ancestor_i in range(length):
-                        array[i, head_col] = ancestor_i - i
+                        array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
                 # if there is no appropriate ancestor, define a new artificial root
                 value = array[i, head_col]
@@ -307,7 +307,7 @@ cdef class Span:
                     new_root = old_to_new_root.get(ancestor_i, None)
                     if new_root is not None:
                         # take the same artificial root as a previous token from the same sentence
-                        array[i, head_col] = new_root - i
+                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
                     else:
                         # set this token as the new artificial root
                         array[i, head_col] = 0

View File

@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
         if key not in IDS:
             raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
-            pass
+            continue
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
         elif key == "SENT_START":
             attrs.append(key)
-            values.append([to_ternary_int(v) for v in value])
+            row = [to_ternary_int(v) for v in value]
         elif key == "MORPH":
             attrs.append(key)
-            values.append([vocab.morphology.add(v) for v in value])
+            row = [vocab.morphology.add(v) for v in value]
         else:
             attrs.append(key)
             if not all(isinstance(v, str) for v in value):
                 types = set([type(v) for v in value])
                 raise TypeError(Errors.E969.format(field=key, types=types)) from None
-            values.append([vocab.strings.add(v) for v in value])
-    array = numpy.asarray(values, dtype="uint64")
+            row = [vocab.strings.add(v) for v in value]
+        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+    array = numpy.array(values, dtype=numpy.uint64)
     return attrs, array.T
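Because `_annot2array` builds plain Python lists, the new shared `values.append(...)` wraps only the negative entries (left-pointing `HEAD` offsets, `SENT_START`'s -1) so that `numpy.array(values, dtype=numpy.uint64)` accepts every row. A small illustration with made-up data:

import numpy

row = [2, -1, 0]  # e.g. relative HEAD offsets; -1 is out of range for uint64
encoded = [numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v
           for v in row]
array = numpy.array([encoded], dtype=numpy.uint64)
# Transposed to (n_tokens, n_attrs) as the function returns it; casting
# back to int32 restores the signed offsets:
assert array.T.astype(numpy.int32).ravel().tolist() == row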

View File

@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
 | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
 | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
 | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
-| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
+| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
 | `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
 | `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ |
 | `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |