Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 19:30:19 +03:00
Major refactoring

commit 42b7b8d509 (parent f7d9942e7c)

@@ -224,11 +224,11 @@ def RichMultiHashEmbed(
     case_sensitive: bool,
     pref_lengths: Optional[List[int]] = None,
     pref_rows: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
+    suff_rows: Optional[List[int]] = None,
     pref_search_chars: Optional[str] = None,
     pref_search_lengths: Optional[List[int]] = None,
     pref_search_rows: Optional[List[int]] = None,
-    suff_lengths: Optional[List[int]] = None,
-    suff_rows: Optional[List[int]] = None,
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
     suff_search_rows: Optional[List[int]] = None,

@@ -252,13 +252,14 @@ def RichMultiHashEmbed(
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
     singular lemmas where the `e` is no longer present and the `ä` has become `a`.
-    For most languages, searching is likely to be useful starting at the end
-    (`suff_*`), but the ability to search from the beginning (`pref_*`) is also
-    offered for completeness. Search characters should consist of all characters
-    that regularly alternate with other characters in the language in question or
-    whose presence before or after characters that would otherwise alternate
-    prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
-    not become `a` if it is the third or fourth vowel from the end of the word.
+    For most languages used with spaCy, searching is likely to be useful starting
+    at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
+    is also offered for completeness. Search characters should consist of all
+    characters that regularly alternate with other characters in the language in
+    question or whose presence before or after characters that would otherwise
+    alternate prevents the alternation from occurring, e.g. an `ä` in a German
+    plural noun does not become `a` if it is the third or fourth vowel from the
+    end of the word.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.

@@ -274,22 +275,18 @@ def RichMultiHashEmbed(
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `s` and `spa` being used as features.
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
-    pref_search_chars (Optional[str]): A string containing characters to search for
-        starting from the beginning of each word. May not contain characters that
-        occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
-        upper-case letters.
-    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
-        to use as features, where the searches start from the beginning of each word.
-    pref_search_rows (Optional[List[int]]): The number of rows for each of
-        `pref_search_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
         `[1, 3]` would lead to `y` and `aCy` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
+    pref_search_chars (Optional[str]): A string containing characters to search for
+        starting from the beginning of each word.
+    pref_search_lengths (Optional[List[int]]): The lengths of search result strings
+        to use as features, where the searches start from the beginning of each word.
+    pref_search_rows (Optional[List[int]]): The number of rows for each of
+        `pref_search_lengths`.
     suff_search_chars (Optional[str]): A string containing characters to search for
-        starting from the end of each word. May not contain characters that
-        occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
-        upper-case letters.
+        starting from the end of each word.
     suff_search_lengths (Optional[List[int]]): The lengths of search result strings
         to use as features, where the searches start from the end of each word.
     suff_search_rows (Optional[List[int]]): The number of rows for each of

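To make the feature semantics described in this docstring concrete, here is a minimal pure-Python sketch of how prefix, suffix and suffix-search features could be derived for one token. The helper name rich_features and its exact behaviour are illustrative assumptions; the real extraction happens in Cython via RichFeatureExtractor.

from typing import List, Optional

def rich_features(
    token: str,
    *,
    case_sensitive: bool = False,
    pref_lengths: Optional[List[int]] = None,
    suff_lengths: Optional[List[int]] = None,
    suff_search_chars: Optional[str] = None,
    suff_search_lengths: Optional[List[int]] = None,
) -> List[str]:
    # Toy version of the feature strings described in the docstring above.
    text = token if case_sensitive else token.lower()
    feats: List[str] = []
    for length in pref_lengths or []:          # e.g. [1, 3] -> "s", "spa"
        feats.append(text[:length])
    for length in suff_lengths or []:          # e.g. [1, 3] -> "y", "acy"
        feats.append(text[-length:])
    if suff_search_chars and suff_search_lengths:
        search = suff_search_chars if case_sensitive else suff_search_chars.lower()
        # characters of the token that occur in the search set, scanned from the end
        hits = [ch for ch in reversed(text) if ch in search]
        for length in suff_search_lengths:     # e.g. [1, 2] -> "y", "yc"
            feats.append("".join(hits[:length]))
    return feats

print(rich_features("spaCy", pref_lengths=[1, 3], suff_lengths=[1, 3],
                    suff_search_chars="yC", suff_search_lengths=[1, 2]))
# ['s', 'spa', 'y', 'acy', 'y', 'yc']

With pref_lengths=[1, 3] and suff_lengths=[1, 3] this reproduces the `s`/`spa` and `y`/`aCy` examples from the docstring, lower-cased because case_sensitive is False.
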
@@ -302,6 +299,9 @@ def RichMultiHashEmbed(
     _verify_rich_config_group(
         "prefix", pref_lengths, pref_rows, None, False, case_sensitive
     )
+    _verify_rich_config_group(
+        "suffix", suff_lengths, suff_rows, None, False, case_sensitive
+    )
     _verify_rich_config_group(
         "prefix search",
         pref_search_lengths,

@@ -310,9 +310,6 @@ def RichMultiHashEmbed(
         True,
         case_sensitive,
     )
-    _verify_rich_config_group(
-        "suffix", suff_lengths, suff_rows, None, False, case_sensitive
-    )
     _verify_rich_config_group(
         "suffix search",
         suff_search_lengths,

@@ -324,10 +321,10 @@ def RichMultiHashEmbed(
 
     if pref_rows is not None:
         rows.extend(pref_rows)
-    if pref_search_rows is not None:
-        rows.extend(pref_search_rows)
     if suff_rows is not None:
         rows.extend(suff_rows)
+    if pref_search_rows is not None:
+        rows.extend(pref_search_rows)
     if suff_search_rows is not None:
         rows.extend(suff_search_rows)
 

@@ -344,9 +341,9 @@ def RichMultiHashEmbed(
         RichFeatureExtractor(
             case_sensitive=case_sensitive,
             pref_lengths=pref_lengths,
+            suff_lengths=suff_lengths,
             pref_search_chars=pref_search_chars,
             pref_search_lengths=pref_search_lengths,
-            suff_lengths=suff_lengths,
             suff_search_chars=suff_search_chars,
             suff_search_lengths=suff_search_lengths,
         ),

@@ -11,9 +11,9 @@ def RichFeatureExtractor(
     *,
     case_sensitive: bool,
     pref_lengths: Optional[List[int]] = None,
+    suff_lengths: Optional[List[int]] = None,
     pref_search_chars: Optional[str] = None,
     pref_search_lengths: Optional[List[int]] = None,
-    suff_lengths: Optional[List[int]] = None,
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:

@@ -67,7 +67,7 @@ cdef void _search_for_chars(
     Py_UCS4* result_buf,
     const int result_buf_len,
     bint suffs_not_prefs
-)
+) nogil
 
 
 cdef class Doc:

@@ -1755,10 +1755,11 @@ cdef class Doc:
         derived from the raw text of each token.
 
         Generally:
-        p_ variables relate to prefixes (affixes starting at the beginning of the word)
-        s_ variables relate to suffixes (affixes starting at the end of the word)
-        ps_ variables relate to searches starting at the beginning of the word
-        ss_ variables relate to searches starting at the end of the word
+
+        p_ variables relate to prefixes (affixes starting at the beginning of the word)
+        s_ variables relate to suffixes (affixes starting at the end of the word)
+        ps_ variables relate to searches starting at the beginning of the word
+        ss_ variables relate to searches starting at the end of the word
 
         cs: if *False*, hashes are generated based on the lower-case version of each token.
         p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*,

@@ -1770,7 +1771,7 @@ cdef class Doc:
         the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
         case-insensitivity to be handled efficiently.
         ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
-        ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
+        ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
         *ps_lengths==[1, 2]*, *ps_search=="aC" and *cs==False*, the searched strings hashed for
         "spaCy" would be "a" and "ac".
         ss_search: a byte array containing characters to search for within each token, starting at the end.

@@ -1778,12 +1779,13 @@ cdef class Doc:
         the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
         case-insensitivity to be handled efficiently.
         ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
-        ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-        *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for
+        ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if
+        *ss_lengths==[1, 2]*, *ss_search=="aC" and *cs==False*, the searched strings hashed for
         "spaCy" would be "c" and "ca".
 
         For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
-        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
+        *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
+        would correspond to
 
         [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
         [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))],

|
||||||
# Define / allocate buffer (pr/sr: result buffers)
|
# Define / allocate buffer (pr/sr: result buffers)
|
||||||
cdef int aff_buf_l = p_max_l + s_max_l
|
cdef int aff_buf_l = p_max_l + s_max_l
|
||||||
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
|
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
|
||||||
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
|
cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
|
||||||
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
|
cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
|
||||||
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
|
cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
|
||||||
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
|
cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
|
||||||
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
|
cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
|
||||||
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
|
cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
|
||||||
|
|
||||||
# Define memory views on length arrays
|
# Define memory views on length arrays
|
||||||
cdef int[:] p_v = p_lengths
|
cdef int[:] p_lengths_v = p_lengths
|
||||||
cdef int[:] s_v = s_lengths
|
cdef int[:] s_lengths_v = s_lengths
|
||||||
cdef int[:] ps_v = ps_lengths
|
cdef int[:] ps_lengths_v = ps_lengths
|
||||||
cdef int[:] ss_v = ss_lengths
|
cdef int[:] ss_lengths_v = ss_lengths
|
||||||
|
|
||||||
# Define working variables
|
# Define working variables
|
||||||
cdef TokenC tok_c
|
cdef TokenC tok_c
|
||||||
|
@@ -1838,27 +1840,27 @@ cdef class Doc:
             _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
 
             for hash_idx in range(p_h_num):
-                hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
+                hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
 
             for hash_idx in range(p_h_num, s_h_end):
-                aff_len = s_v[hash_idx - p_h_num]
+                aff_len = s_lengths_v[hash_idx - p_h_num]
                 hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
 
             if ps_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
+                _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
                 for hash_idx in range(s_h_end, ps_h_end):
-                    aff_len = ps_v[hash_idx - s_h_end]
-                    hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
+                    aff_len = ps_lengths_v[hash_idx - s_h_end]
+                    hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
 
             if ss_h_num > 0:
-                _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
+                _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
                 for hash_idx in range(ps_h_end, ss_h_end):
-                    aff_len = ss_v[hash_idx - ps_h_end]
-                    hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
+                    aff_len = ss_lengths_v[hash_idx - ps_h_end]
+                    hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
 
         self.mem.free(aff_buf)
-        self.mem.free(pr_buf)
-        self.mem.free(sr_buf)
+        self.mem.free(ps_r_buf)
+        self.mem.free(ss_r_buf)
         return hashes
 
     @staticmethod

@@ -2051,8 +2053,9 @@ cdef void _copy_chars(
     """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
    any upper-case characters to lower case within the target buffer.
     """
-    memcpy(target, source, length * sizeof(Py_UCS4))
     cdef int idx
+
+    memcpy(target, source, length * sizeof(Py_UCS4))
     if to_lower:
         for idx in range(length):
             if Py_UNICODE_ISUPPER(target[idx]):

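In Python terms, the docstring above amounts to a bounded copy with optional lower-casing; a trivial sketch follows (the real helper works in place on Py_UCS4 buffers, so the function below is only an analogy):

def copy_chars(source, length, to_lower=False):
    # Copy *length* characters and optionally lower-case them, as the
    # docstring describes.
    chunk = source[:length]
    return chunk.lower() if to_lower else chunk

print(copy_chars("SpaCy rocks", 5, to_lower=True))  # 'spacy'
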
@@ -2089,15 +2092,18 @@ cdef void _set_affixes(
     if tok_len < pref_len:
         memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
     aff_buf_idx = aff_buf_len - suff_len
 
     if tok_len < suff_len:
         memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
     aff_buf_idx = aff_buf_len - tok_len
+
     if suff_len > 0:
+        # in_word_idx: the index within the token where the suffix starts
        in_word_idx = aff_buf_idx + tok_len - aff_buf_len
         if in_word_idx < pref_len:
             memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
             aff_buf_idx += filled_pref_len - in_word_idx
+            in_word_idx = aff_buf_idx + tok_len - aff_buf_len
         if aff_buf_idx < aff_buf_len:
             _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
 

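A conceptual pure-Python analogue of the buffer that _set_affixes fills, namely a prefix region followed by a suffix region with short tokens padded; the padding direction is inferred from the memset calls in this hunk and the padded strings in the docstring example, so treat this as a sketch rather than a statement of the exact buffer contents.

def affix_buffer(token, pref_len, suff_len, to_lower=False, pad="\0"):
    # One buffer holding the (padded) prefix followed by the (padded) suffix.
    text = token.lower() if to_lower else token
    prefix = text[:pref_len].ljust(pref_len, pad)
    suffix = text[-suff_len:].rjust(suff_len, pad) if suff_len else ""
    return prefix + suffix

print(repr(affix_buffer("spaCy", 2, 6, pad=" ")))  # 'sp spaCy'
print(repr(affix_buffer("and", 2, 6, pad=" ")))    # 'an   and'
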
@@ -2112,7 +2118,7 @@ cdef void _search_for_chars(
     Py_UCS4* result_buf,
     const int result_buf_len,
     bint suffs_not_prefs
-):
+) nogil:
     """ Search a word within a string for characters within *search_buf*, starting at the beginning or
     end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
     the corresponding character from *lookup_buf* is added to *result_buf*.

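For orientation, a rough pure-Python analogue of the search this docstring describes: scan the token from one end and, for every character found in *search_buf*, emit the character at the same position in *lookup_buf*. The helper below, including its early exit once the result is full, is an illustrative assumption rather than the Cython implementation.

def search_for_chars(word, search, lookup, max_results, suffs_not_prefs):
    # Scan from the end if suffs_not_prefs, else from the beginning; for every
    # character found in *search*, emit the character at the same position in
    # *lookup*, stopping once *max_results* characters have been collected.
    chars = reversed(word) if suffs_not_prefs else iter(word)
    result = []
    for ch in chars:
        pos = search.find(ch)
        if pos >= 0:
            result.append(lookup[pos])
        if len(result) >= max_results:
            break
    return "".join(result)

# Case-insensitive setup: search both cases, look up the lower-case variant.
print(search_for_chars("spaCy", "aAcC", "aacc", 2, suffs_not_prefs=True))   # 'ca'
print(search_for_chars("spaCy", "aAcC", "aacc", 2, suffs_not_prefs=False))  # 'ac'
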
@@ -1791,8 +1791,7 @@ def get_arrays_for_search_chars(
     def get_ordered_raw_bytes(
         search: List[bytes], lookup: List[bytes]
     ) -> Tuple[bytes, bytes]:
-        """Flatten the two lists, ordering both by the entries in *search*
-        using the native endianness of the platform.
+        """Flatten the two lists, ordering both by the entries in *search*.
         """
         num_search = [list(entry) for entry in search]
         search = [entry for _, entry in sorted(zip(num_search, search))]

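To illustrate the parallel ordering, here is a hypothetical self-contained completion of the helper above: both lists are sorted by the byte values of the *search* entries so that search and lookup stay aligned, then flattened.

from typing import List, Tuple

def get_ordered_raw_bytes(search: List[bytes], lookup: List[bytes]) -> Tuple[bytes, bytes]:
    """Flatten the two lists, ordering both by the entries in *search*."""
    num_search = [list(entry) for entry in search]
    # Sort both lists by the byte values of the *search* entries, then flatten.
    ordered_search = [entry for _, entry in sorted(zip(num_search, search))]
    ordered_lookup = [entry for _, entry in sorted(zip(num_search, lookup))]
    return b"".join(ordered_search), b"".join(ordered_lookup)

print(get_ordered_raw_bytes([b"C", b"A"], [b"c", b"a"]))  # (b'AC', b'ac')
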
@@ -210,13 +210,14 @@ one letter or letters regularly alternate with another letter or letters
 depending on the presence of some other letter before or after it, e.g. German
 plural nouns where the final two vowels are `ä-e` regularly correspond to
 singular lemmas where the `e` is no longer present and the `ä` has become `a`.
-For most languages, searching is likely to be useful starting at the end
-(`suff_*`), but the ability to search from the beginning (`pref_*`) is also
-offered for completeness. Search characters should consist of all characters
-that regularly alternate with other characters in the language in question or
-whose presence before or after characters that would otherwise alternate
-prevents the alternation from occurring, e.g. an `ä` in a German plural noun
-does not become `a` if it is the third or fourth vowel from the end of the word.
+For most languages used with spaCy, searching is likely to be useful starting
+at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
+is also offered for completeness. Search characters should consist of all
+characters that regularly alternate with other characters in the language in
+question or whose presence before or after characters that would otherwise
+alternate prevents the alternation from occurring, e.g. an `ä` in a German
+plural noun does not become `a` if it is the third or fourth vowel from the
+end of the word.
 
 | Name | Description |
 | ------------------------ | -------------------------------------------------------------------------------------- |

@@ -226,13 +227,13 @@ does not become `a` if it is the third or fourth vowel from the end of the word.
 | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
 | `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
 | `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]~~ |
+| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ |
 | `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ |
-| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
+| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
 | `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
 | `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]~~ |
-| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ |
 | `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ |
-| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
+| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
 | `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
 | `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |