Merge remote-tracking branch 'upstream/master' into rapidfuzz

This commit is contained in:
Adriane Boyd 2022-12-19 15:46:17 +01:00
commit bac3a08023
10 changed files with 21 additions and 17 deletions

View File

@ -107,7 +107,7 @@ steps:
displayName: "Run CPU tests"
- script: |
python -m pip install --pre thinc-apple-ops
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

View File

@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

View File

@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0

View File

@ -47,7 +47,7 @@ install_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies

View File

@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start
arr = doc.to_array(["HEAD"])
arr[0] = -1
arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
# head after end
arr = doc.to_array(["HEAD"])
arr[0] = 5
arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)

View File

@ -140,7 +140,7 @@ def test_issue11235():
assert os.path.exists(d / "cfg")
assert os.path.exists(d / f"{lang_var}_model")
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
def test_cli_info():
nlp = Dutch()

View File

@ -359,6 +359,7 @@ cdef class Doc:
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]
@ -1558,6 +1559,7 @@ cdef class Doc:
for j, (attr, annot) in enumerate(token_annotations.items()):
if attr is HEAD:
annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
array[i, j] = annot[i]
elif attr is MORPH:

View File

@ -299,7 +299,7 @@ cdef class Span:
for ancestor in ancestors:
ancestor_i = ancestor.i - self.c.start
if ancestor_i in range(length):
array[i, head_col] = ancestor_i - i
array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col]
@ -307,7 +307,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None:
# take the same artificial root as a previous token from the same sentence
array[i, head_col] = new_root - i
array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else:
# set this token as the new artificial root
array[i, head_col] = 0

View File

@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]:
pass
continue
elif key == "HEAD":
attrs.append(key)
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP":
attrs.append(key)
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START":
attrs.append(key)
values.append([to_ternary_int(v) for v in value])
row = [to_ternary_int(v) for v in value]
elif key == "MORPH":
attrs.append(key)
values.append([vocab.morphology.add(v) for v in value])
row = [vocab.morphology.add(v) for v in value]
else:
attrs.append(key)
if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
values.append([vocab.strings.add(v) for v in value])
array = numpy.asarray(values, dtype="uint64")
row = [vocab.strings.add(v) for v in value]
values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T

View File

@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. ~~bool~~ |
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |