Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-03 03:40:24 +03:00)

commit a18bac40f5
parent 910a6bc98f

Improvements
@@ -981,31 +981,25 @@ def test_get_affixes_good_case(en_tokenizer):
     doc = en_tokenizer("spaCy✨ and Prodigy")
     prefixes = doc.get_affixes(False, 1, 5, "", 2, 3)
     suffixes = doc.get_affixes(True, 2, 6, "xx✨rP", 2, 3)
-    assert prefixes[0][3, 3, 6] == suffixes[0][3, 3, 6]
-    assert prefixes[0][3, 3, 7] == suffixes[0][3, 3, 7]
-    assert prefixes[0][3, 3, 4] == suffixes[0][3, 3, 8]
-    assert prefixes[0][3, 3, 5] == suffixes[0][3, 3, 9]
-    assert (prefixes[0][0, :, 2:] == 0).all()
-    assert not (suffixes[0][0, :, 2:] == 0).all()
-    assert (suffixes[0][0, :, 4:] == 0).all()
-    assert (prefixes[0][1, :, 4:] == 0).all()
-    assert (prefixes[0][:, 1, 2:] == 0).all()
-    assert not (suffixes[0][1, :, 4:] == 0).all()
-    assert (suffixes[0][1, :, 6:] == 0).all()
-    assert prefixes[0][0][0][0] == 0
-    assert prefixes[0][0][1][0] != 0
-    assert (prefixes[1] == 0).all()
-    assert (suffixes[1][0][0] == 0).all()
-    assert suffixes[1][0][1].tolist() == [39, 40, 0, 0]
-    assert suffixes[1][0][3].tolist() == [0, 114, 0, 80]
+    assert prefixes[0][3, 3, 3] == suffixes[0][3, 3, 3]
+    assert prefixes[0][3, 3, 2] == suffixes[0][3, 3, 4]
+    assert (prefixes[0][0, :, 1:] == 0).all()
+    assert not (suffixes[0][0, :, 1:] == 0).all()
+    assert (suffixes[0][0, :, 2:] == 0).all()
+    assert (prefixes[0][1, :, 2:] == 0).all()
+    assert (prefixes[0][:, 1, 1:] == 0).all()
+    assert not (suffixes[0][1, :, 2:] == 0).all()
+    assert (suffixes[0][1, :, 3:] == 0).all()
+    assert suffixes[1][0][1].tolist() == [10024, 0]
+    assert suffixes[1][0][3].tolist() == [114, 112]
 
 
 def test_get_affixes_4_byte_normal_char(en_tokenizer):
     doc = en_tokenizer("and𐌞")
     suffixes = doc.get_affixes(True, 2, 6, "a", 1, 2)
-    assert (suffixes[0][:, 0, 2] == 216).all()
-    assert suffixes[0][3, 0, 9] == 97
-    assert suffixes[1][0, 0, 1] == 97
+    assert (suffixes[0][:, 0, 1] == 55296).all()
+    assert suffixes[0][3, 0, 4] == 97
+    assert suffixes[1][0, 0, 0] == 97
 
 
 def test_get_affixes_4_byte_special_char(en_tokenizer):
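The updated assertions follow from the storage change in this commit: each slot in the returned arrays now holds one 16-bit UTF-16BE code unit instead of one raw byte, and special characters are matched on lowercased text. A minimal sketch of where the new expected values come from (standard library only; code_units is a hypothetical helper, not part of the commit):

    def code_units(text: str):
        """Big-endian UTF-16 code units of `text`, as integers."""
        enc = text.encode("utf-16BE")
        return [(enc[i] << 8) + enc[i + 1] for i in range(0, len(enc), 2)]

    assert code_units("✨") == [10024]     # U+2728 fits in a single code unit
    assert code_units("a") == [97]
    assert code_units("rp") == [114, 112]  # "rP".lower(), matching the lowercased search
    assert code_units("𐌞")[0] == 55296     # 4-byte char: high surrogate 0xD800

The old assertions checked individual bytes (e.g. 216 == 0xD8, the first byte of the surrogate pair), which is why the indices into the last axis are roughly halved in the new versions.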
@@ -1738,47 +1738,44 @@ cdef class Doc:
         """
         TODO
         """
-        byte_strings = [token.orth_.encode('utf-16BE') for token in self]
-        cdef int num_tokens = len(byte_strings)
-
-        special_chars_enc = special_chars.encode('utf-16BE')
-        cdef int sc_test_len = len(special_chars)
-        if sc_test_len * 2 != len(special_chars_enc):
-            raise ValueError(Errors.E1044)
-
-        cdef np.ndarray[np.uint8_t, ndim=3] outputs = numpy.zeros(
-            (len_end - len_start, num_tokens, (len_end - 1) * 2), dtype="uint8")
-        cdef np.ndarray[np.uint8_t, ndim=3] sc_outputs = numpy.zeros(
-            (sc_len_end - sc_len_start, num_tokens, (sc_len_end - 1) * 2), dtype="uint8")
-
         cdef bytes byte_string
-        cdef char this_char_part, next_char_part, this_test_char_part, next_test_char_part
-        cdef int len_byte_string, idx, sc_char_idx, sc_test_idx, this_len, this_sc_len
+        cdef np.uint16_t this_char
+        cdef int idx, len_byte_string, sc_char_idx, sc_test_idx, this_len, this_sc_len
+        cdef int num_tokens = len(self)
+
+        cdef bytes sc_enc = special_chars.lower().encode("utf-16BE")
+        cdef int sc_test_len = len(special_chars)
+        if sc_test_len * 2 != len(sc_enc):
+            raise ValueError(Errors.E1044)
+        cdef np.ndarray[np.uint16_t, ndim=1] scs = numpy.empty((sc_test_len,), dtype="uint16")
+        for idx in range(sc_test_len):
+            scs[idx] = (sc_enc[idx*2] << 8) + sc_enc[idx * 2 + 1]
+
+        cdef np.ndarray[np.uint16_t, ndim=3] outputs = numpy.zeros(
+            (len_end - len_start, num_tokens, len_end - 1), dtype="uint16")
+        cdef np.ndarray[np.uint16_t, ndim=3] sc_outputs = numpy.zeros(
+            (sc_len_end - sc_len_start, num_tokens, sc_len_end - 1), dtype="uint16")
 
-        for token_idx, byte_string in enumerate(byte_strings):
+        for token_idx in range(num_tokens):
+            byte_string = self[token_idx].lower_.encode("utf-16BE")
             idx = 0
             sc_char_idx = 0
             len_byte_string = len(byte_string)
 
             while (idx < len_end - 1 or sc_char_idx < sc_len_end - 1) and idx * 2 < len_byte_string:
                 char_first_byte_idx = len_byte_string - 2 * (idx + 1) if suffs_not_prefs else idx * 2
-                this_char_part = byte_string[char_first_byte_idx]
-                next_char_part = byte_string[char_first_byte_idx + 1]
+                this_char = (byte_string[char_first_byte_idx] << 8) + byte_string[char_first_byte_idx + 1]
                 for this_len in range(len_end-1, len_start-1, -1):
                     if idx >= this_len:
                         break
-                    outputs[this_len - len_start, token_idx, idx * 2] = this_char_part
-                    outputs[this_len - len_start, token_idx, idx * 2 + 1] = next_char_part
+                    outputs[this_len - len_start, token_idx, idx] = this_char
                 sc_test_idx = 0
                 while sc_test_len > sc_test_idx:
-                    this_test_char_part = special_chars_enc[sc_test_idx*2]
-                    next_test_char_part = special_chars_enc[sc_test_idx*2 + 1]
-                    if this_char_part == this_test_char_part and next_char_part == next_test_char_part:
+                    if this_char == scs[sc_test_idx]:
                         for this_sc_len in range(sc_len_end-1, sc_len_start-1, -1):
                             if sc_char_idx >= this_sc_len:
                                 break
-                            sc_outputs[this_sc_len - sc_len_start, token_idx, sc_char_idx * 2] = this_char_part
-                            sc_outputs[this_sc_len - sc_len_start, token_idx, sc_char_idx * 2 + 1] = next_char_part
+                            sc_outputs[this_sc_len - sc_len_start, token_idx, sc_char_idx] = this_char
                         sc_char_idx += 1
                         break
                     sc_test_idx += 1
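For reference, a pure-Python sketch (hypothetical, not part of the commit) of the new matching logic: pack each UTF-16BE byte pair into a single 16-bit value, then compare whole code units against the lookup array, as the loop above does with scs:

    def pack_utf16be(data: bytes):
        # One uint16 per big-endian byte pair, as in the scs construction above.
        return [(data[i] << 8) + data[i + 1] for i in range(0, len(data), 2)]

    special = pack_utf16be("a".encode("utf-16BE"))
    token = pack_utf16be("and𐌞".lower().encode("utf-16BE"))

    # "𐌞" occupies two code units (a surrogate pair); neither half equals "a",
    # so only the genuine "a" matches.
    assert [cu for cu in token if cu in special] == [97]

Replacing the two single-byte comparisons with one uint16 comparison halves the inner-loop work and halves the width of the output arrays.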