mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 19:30:19 +03:00
Major refactoring
This commit is contained in:
parent
f7d9942e7c
commit
42b7b8d509
|
@ -224,11 +224,11 @@ def RichMultiHashEmbed(
|
|||
case_sensitive: bool,
|
||||
pref_lengths: Optional[List[int]] = None,
|
||||
pref_rows: Optional[List[int]] = None,
|
||||
suff_lengths: Optional[List[int]] = None,
|
||||
suff_rows: Optional[List[int]] = None,
|
||||
pref_search_chars: Optional[str] = None,
|
||||
pref_search_lengths: Optional[List[int]] = None,
|
||||
pref_search_rows: Optional[List[int]] = None,
|
||||
suff_lengths: Optional[List[int]] = None,
|
||||
suff_rows: Optional[List[int]] = None,
|
||||
suff_search_chars: Optional[str] = None,
|
||||
suff_search_lengths: Optional[List[int]] = None,
|
||||
suff_search_rows: Optional[List[int]] = None,
|
||||
|
@ -252,13 +252,14 @@ def RichMultiHashEmbed(
|
|||
depending on the presence of some other letter before or after it, e.g. German
|
||||
plural nouns where the final two vowels are `ä-e` regularly correspond to
|
||||
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
|
||||
For most languages, searching is likely to be useful starting at the end
|
||||
(`suff_*`), but the ability to search from the beginning (`pref_*`) is also
|
||||
offered for completeness. Search characters should consist of all characters
|
||||
that regularly alternate with other characters in the language in question or
|
||||
whose presence before or after characters that would otherwise alternate
|
||||
prevents the alternation from occurring, e.g. an `ä` in a German plural noun does
|
||||
not become `a` if it is the third or fourth vowel from the end of the word.
|
||||
For most languages used with spaCy, searching is likely to be useful starting
|
||||
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
|
||||
is also offered for completeness. Search characters should consist of all
|
||||
characters that regularly alternate with other characters in the language in
|
||||
question or whose presence before or after characters that would otherwise
|
||||
alternate prevents the alternation from occurring, e.g. an `ä` in a German
|
||||
plural noun does not become `a` if it is the third or fourth vowel from the
|
||||
end of the word.
|
||||
|
||||
width (int): The output width. Also used as the width of the embedding tables.
|
||||
Recommended values are between 64 and 300.
|
||||
|
@ -274,22 +275,18 @@ def RichMultiHashEmbed(
|
|||
for each word, e.g. for the word `spaCy`:
|
||||
`[1, 3]` would lead to `s` and `spa` being used as features.
|
||||
pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
|
||||
pref_search_chars (Optional[str]): A string containing characters to search for
|
||||
starting from the beginning of each word. May not contain characters that
|
||||
occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
|
||||
upper-case letters.
|
||||
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
|
||||
to use as features, where the searches start from the beginning of each word.
|
||||
pref_search_rows (Optional[List[int]]): The number of rows for each of
|
||||
`pref_search_lengths`.
|
||||
suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
|
||||
for each word, e.g. for the word `spaCy`:
|
||||
`[1, 3]` would lead to `y` and `aCy` being used as features.
|
||||
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
|
||||
pref_search_chars (Optional[str]): A string containing characters to search for
|
||||
starting from the beginning of each word.
|
||||
pref_search_lengths (Optional[List[int]]): The lengths of search result strings
|
||||
to use as features, where the searches start from the beginning of each word.
|
||||
pref_search_rows (Optional[List[int]]): The number of rows for each of
|
||||
`pref_search_lengths`.
|
||||
suff_search_chars (Optional[str]): A string containing characters to search for
|
||||
starting from the end of each word. May not contain characters that
|
||||
occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain
|
||||
upper-case letters.
|
||||
starting from the end of each word.
|
||||
suff_search_lengths (Optional[List[int]]): The lengths of search result strings
|
||||
to use as features, where the searches start from the end of each word.
|
||||
suff_search_rows (Optional[List[int]]): The number of rows for each of
|
||||
|
@ -302,6 +299,9 @@ def RichMultiHashEmbed(
|
|||
_verify_rich_config_group(
|
||||
"prefix", pref_lengths, pref_rows, None, False, case_sensitive
|
||||
)
|
||||
_verify_rich_config_group(
|
||||
"suffix", suff_lengths, suff_rows, None, False, case_sensitive
|
||||
)
|
||||
_verify_rich_config_group(
|
||||
"prefix search",
|
||||
pref_search_lengths,
|
||||
|
@ -310,9 +310,6 @@ def RichMultiHashEmbed(
|
|||
True,
|
||||
case_sensitive,
|
||||
)
|
||||
_verify_rich_config_group(
|
||||
"suffix", suff_lengths, suff_rows, None, False, case_sensitive
|
||||
)
|
||||
_verify_rich_config_group(
|
||||
"suffix search",
|
||||
suff_search_lengths,
|
||||
|
@ -324,10 +321,10 @@ def RichMultiHashEmbed(
|
|||
|
||||
if pref_rows is not None:
|
||||
rows.extend(pref_rows)
|
||||
if pref_search_rows is not None:
|
||||
rows.extend(pref_search_rows)
|
||||
if suff_rows is not None:
|
||||
rows.extend(suff_rows)
|
||||
if pref_search_rows is not None:
|
||||
rows.extend(pref_search_rows)
|
||||
if suff_search_rows is not None:
|
||||
rows.extend(suff_search_rows)
|
||||
|
||||
|
@ -344,9 +341,9 @@ def RichMultiHashEmbed(
|
|||
RichFeatureExtractor(
|
||||
case_sensitive=case_sensitive,
|
||||
pref_lengths=pref_lengths,
|
||||
suff_lengths=suff_lengths,
|
||||
pref_search_chars=pref_search_chars,
|
||||
pref_search_lengths=pref_search_lengths,
|
||||
suff_lengths=suff_lengths,
|
||||
suff_search_chars=suff_search_chars,
|
||||
suff_search_lengths=suff_search_lengths,
|
||||
),
|
||||
|
|
|
@ -11,9 +11,9 @@ def RichFeatureExtractor(
|
|||
*,
|
||||
case_sensitive: bool,
|
||||
pref_lengths: Optional[List[int]] = None,
|
||||
suff_lengths: Optional[List[int]] = None,
|
||||
pref_search_chars: Optional[str] = None,
|
||||
pref_search_lengths: Optional[List[int]] = None,
|
||||
suff_lengths: Optional[List[int]] = None,
|
||||
suff_search_chars: Optional[str] = None,
|
||||
suff_search_lengths: Optional[List[int]] = None,
|
||||
) -> Model[List[Doc], List[Ints2d]]:
|
||||
|
|
|
@ -67,7 +67,7 @@ cdef void _search_for_chars(
|
|||
Py_UCS4* result_buf,
|
||||
const int result_buf_len,
|
||||
bint suffs_not_prefs
|
||||
)
|
||||
) nogil
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
|
|
|
@ -1755,6 +1755,7 @@ cdef class Doc:
|
|||
derived from the raw text of each token.
|
||||
|
||||
Generally:
|
||||
|
||||
p_ variables relate to prefixes (affixes starting at the beginning of the word)
|
||||
s_ variables relate to suffixes (affixes starting at the end of the word)
|
||||
ps_ variables relate to searches starting at the beginning of the word
|
||||
|
@ -1770,7 +1771,7 @@ cdef class Doc:
|
|||
the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables
|
||||
case-insensitivity to be handled efficiently.
|
||||
ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup*
|
||||
ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if
|
||||
ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if
|
||||
*ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for
|
||||
"spaCy" would be "a" and "ac".
|
||||
ss_search: a byte array containing characters to search for within each token, starting at the end.
|
||||
|
@ -1778,12 +1779,13 @@ cdef class Doc:
|
|||
the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables
|
||||
case-insensitivity to be handled efficiently.
|
||||
ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*
|
||||
ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if
|
||||
*suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for
|
||||
ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if
|
||||
*ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for
|
||||
"spaCy" would be "c" and "ca".
|
||||
|
||||
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by
|
||||
*get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to
|
||||
*get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])*
|
||||
would correspond to
|
||||
|
||||
[[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")],
|
||||
[hash("an"), hash("nd"), hash(" and"), hash(" and"), hash(" "), hash(" ")],
|
||||
|
@ -1812,18 +1814,18 @@ cdef class Doc:
|
|||
# Define / allocate buffer (pr/sr: result buffers)
|
||||
cdef int aff_buf_l = p_max_l + s_max_l
|
||||
cdef Py_UCS4* aff_buf = <Py_UCS4*> self.mem.alloc(aff_buf_l, sizeof(Py_UCS4))
|
||||
cdef Py_UCS4* ps_buf = <Py_UCS4*> ps_search
|
||||
cdef Py_UCS4* pl_buf = <Py_UCS4*> ps_lookup
|
||||
cdef Py_UCS4* pr_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
|
||||
cdef Py_UCS4* ss_buf = <Py_UCS4*> ss_search
|
||||
cdef Py_UCS4* sl_buf = <Py_UCS4*> ss_lookup
|
||||
cdef Py_UCS4* sr_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
|
||||
cdef Py_UCS4* ps_s_buf = <Py_UCS4*> ps_search
|
||||
cdef Py_UCS4* ps_l_buf = <Py_UCS4*> ps_lookup
|
||||
cdef Py_UCS4* ps_r_buf = <Py_UCS4*> self.mem.alloc(ps_max_l, sizeof(Py_UCS4))
|
||||
cdef Py_UCS4* ss_s_buf = <Py_UCS4*> ss_search
|
||||
cdef Py_UCS4* ss_l_buf = <Py_UCS4*> ss_lookup
|
||||
cdef Py_UCS4* ss_r_buf = <Py_UCS4*> self.mem.alloc(ss_max_l, sizeof(Py_UCS4))
|
||||
|
||||
# Define memory views on length arrays
|
||||
cdef int[:] p_v = p_lengths
|
||||
cdef int[:] s_v = s_lengths
|
||||
cdef int[:] ps_v = ps_lengths
|
||||
cdef int[:] ss_v = ss_lengths
|
||||
cdef int[:] p_lengths_v = p_lengths
|
||||
cdef int[:] s_lengths_v = s_lengths
|
||||
cdef int[:] ps_lengths_v = ps_lengths
|
||||
cdef int[:] ss_lengths_v = ss_lengths
|
||||
|
||||
# Define working variables
|
||||
cdef TokenC tok_c
|
||||
|
@ -1838,27 +1840,27 @@ cdef class Doc:
|
|||
_set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs)
|
||||
|
||||
for hash_idx in range(p_h_num):
|
||||
hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0)
|
||||
hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0)
|
||||
|
||||
for hash_idx in range(p_h_num, s_h_end):
|
||||
aff_len = s_v[hash_idx - p_h_num]
|
||||
aff_len = s_lengths_v[hash_idx - p_h_num]
|
||||
hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0)
|
||||
|
||||
if ps_h_num > 0:
|
||||
_search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False)
|
||||
_search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False)
|
||||
for hash_idx in range(s_h_end, ps_h_end):
|
||||
aff_len = ps_v[hash_idx - s_h_end]
|
||||
hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0)
|
||||
aff_len = ps_lengths_v[hash_idx - s_h_end]
|
||||
hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0)
|
||||
|
||||
if ss_h_num > 0:
|
||||
_search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True)
|
||||
_search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True)
|
||||
for hash_idx in range(ps_h_end, ss_h_end):
|
||||
aff_len = ss_v[hash_idx - ps_h_end]
|
||||
hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0)
|
||||
aff_len = ss_lengths_v[hash_idx - ps_h_end]
|
||||
hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0)
|
||||
|
||||
self.mem.free(aff_buf)
|
||||
self.mem.free(pr_buf)
|
||||
self.mem.free(sr_buf)
|
||||
self.mem.free(ps_r_buf)
|
||||
self.mem.free(ss_r_buf)
|
||||
return hashes
|
||||
|
||||
@staticmethod
|
||||
|
@ -2051,8 +2053,9 @@ cdef void _copy_chars(
|
|||
"""Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts
|
||||
any upper-case characters to lower case within the target buffer.
|
||||
"""
|
||||
memcpy(target, source, length * sizeof(Py_UCS4))
|
||||
cdef int idx
|
||||
|
||||
memcpy(target, source, length * sizeof(Py_UCS4))
|
||||
if to_lower:
|
||||
for idx in range(length):
|
||||
if Py_UNICODE_ISUPPER(target[idx]):
|
||||
|
@ -2089,15 +2092,18 @@ cdef void _set_affixes(
|
|||
if tok_len < pref_len:
|
||||
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len))
|
||||
aff_buf_idx = aff_buf_len - suff_len
|
||||
|
||||
if tok_len < suff_len:
|
||||
memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len))
|
||||
aff_buf_idx = aff_buf_len - tok_len
|
||||
|
||||
if suff_len > 0:
|
||||
# in_word_idx: the index within the token where the suffix starts
|
||||
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
|
||||
if in_word_idx < pref_len:
|
||||
memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx))
|
||||
aff_buf_idx += filled_pref_len - in_word_idx
|
||||
in_word_idx = aff_buf_idx + tok_len - aff_buf_len
|
||||
if aff_buf_idx < aff_buf_len:
|
||||
_copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower)
|
||||
|
||||
|
@ -2112,7 +2118,7 @@ cdef void _search_for_chars(
|
|||
Py_UCS4* result_buf,
|
||||
const int result_buf_len,
|
||||
bint suffs_not_prefs
|
||||
):
|
||||
) nogil:
|
||||
""" Search a word within a string for characters within *search_buf*, starting at the beginning or
|
||||
end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches,
|
||||
the corresponding character from *lookup_buf* is added to *result_buf*.
|
||||
|
|
|
@ -1791,8 +1791,7 @@ def get_arrays_for_search_chars(
|
|||
def get_ordered_raw_bytes(
|
||||
search: List[bytes], lookup: List[bytes]
|
||||
) -> Tuple[bytes, bytes]:
|
||||
"""Flatten the two lists, ordering both by the entries in *search*
|
||||
using the native endianness of the platform.
|
||||
"""Flatten the two lists, ordering both by the entries in *search*.
|
||||
"""
|
||||
num_search = [list(entry) for entry in search]
|
||||
search = [entry for _, entry in sorted(zip(num_search, search))]
|
||||
|
|
|
@ -210,13 +210,14 @@ one letter or letters regularly alternate with another letter or letters
|
|||
depending on the presence of some other letter before or after it, e.g. German
|
||||
plural nouns where the final two vowels are `ä-e` regularly correspond to
|
||||
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
|
||||
For most languages, searching is likely to be useful starting at the end
|
||||
(`suff_*`), but the ability to search from the beginning (`pref_*`) is also
|
||||
offered for completeness. Search characters should consist of all characters
|
||||
that regularly alternate with other characters in the language in question or
|
||||
whose presence before or after characters that would otherwise alternate
|
||||
prevents the alternation from occurring, e.g. an `ä` in a German plural noun
|
||||
does not become `a` if it is the third or fourth vowel from the end of the word.
|
||||
For most languages used with spaCy, searching is likely to be useful starting
|
||||
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
|
||||
is also offered for completeness. Search characters should consist of all
|
||||
characters that regularly alternate with other characters in the language in
|
||||
question or whose presence before or after characters that would otherwise
|
||||
alternate prevents the alternation from occurring, e.g. an `ä` in a German
|
||||
plural noun does not become `a` if it is the third or fourth vowel from the
|
||||
end of the word.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
|
@ -226,13 +227,13 @@ does not become `a` if it is the third or fourth vowel from the end of the word.
|
|||
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
|
||||
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
|
||||
| `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]]~~ |
|
||||
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
|
||||
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
|
||||
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
|
||||
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
|
||||
| `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]]~~ |
|
||||
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
|
||||
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ |
|
||||
| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ |
|
||||
| `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ |
|
||||
| `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user