From 42b7b8d509994864fdf0395007fa4117a88d6aff Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Fri, 21 Oct 2022 12:01:24 +0200 Subject: [PATCH] Major refactoring --- spacy/ml/models/tok2vec.py | 49 +++++++++++------------ spacy/ml/richfeatureextractor.py | 2 +- spacy/tokens/doc.pxd | 2 +- spacy/tokens/doc.pyx | 66 +++++++++++++++++-------------- spacy/util.py | 3 +- website/docs/api/architectures.md | 21 +++++----- 6 files changed, 73 insertions(+), 70 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index bd67613e1..2d72a417f 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -224,11 +224,11 @@ def RichMultiHashEmbed( case_sensitive: bool, pref_lengths: Optional[List[int]] = None, pref_rows: Optional[List[int]] = None, + suff_lengths: Optional[List[int]] = None, + suff_rows: Optional[List[int]] = None, pref_search_chars: Optional[str] = None, pref_search_lengths: Optional[List[int]] = None, pref_search_rows: Optional[List[int]] = None, - suff_lengths: Optional[List[int]] = None, - suff_rows: Optional[List[int]] = None, suff_search_chars: Optional[str] = None, suff_search_lengths: Optional[List[int]] = None, suff_search_rows: Optional[List[int]] = None, @@ -252,13 +252,14 @@ def RichMultiHashEmbed( depending on the presence of some other letter before or after it, e.g. German plural nouns where the final two vowels are `ä-e` regularly correspond to singular lemmas where the `e` is no longer present and the `ä` has become `a`. - For most languages, searching is likely to be useful starting at the end - (`suff_*`), but the ability to search from the beginning (`pref_*`) is also - offered for completeness. Search characters should consist of all characters - that regularly alternate with other characters in the language in question or - whose presence before or after characters that would otherwise alternate - prevents the alternation from occurring, e.g. 
an `ä` in a German plural noun does - not become `a` if it is the third or fourth vowel from the end of the word. + For most languages used with spaCy, searching is likely to be useful starting + at the end (`suff_*`), but the ability to search from the beginning (`pref_*`) + is also offered for completeness. Search characters should consist of all + characters that regularly alternate with other characters in the language in + question or whose presence before or after characters that would otherwise + alternate prevents the alternation from occurring, e.g. an `ä` in a German + plural noun does not become `a` if it is the third or fourth vowel from the + end of the word. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. @@ -274,22 +275,18 @@ def RichMultiHashEmbed( for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`. - pref_search_chars (Optional[str]): A string containing characters to search for - starting from the beginning of each word. May not contain characters that - occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain - upper-case letters. - pref_search_lengths (Optional[List[int]]): The lengths of search result strings - to use as features, where the searches start from the beginning of each word. - pref_search_rows (Optional[List[int]]): The number of rows for each of - `pref_search_lengths`. suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`. + pref_search_chars (Optional[str]): A string containing characters to search for + starting from the beginning of each word. 
+ pref_search_lengths (Optional[List[int]]): The lengths of search result strings + to use as features, where the searches start from the beginning of each word. + pref_search_rows (Optional[List[int]]): The number of rows for each of + `pref_search_lengths`. suff_search_chars (Optional[str]): A string containing characters to search for - starting from the end of each word. May not contain characters that - occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain - upper-case letters. + starting from the end of each word. suff_search_lengths (Optional[List[int]]): The lengths of search result strings to use as features, where the searches start from the end of each word. suff_search_rows (Optional[List[int]]): The number of rows for each of @@ -302,6 +299,9 @@ def RichMultiHashEmbed( _verify_rich_config_group( "prefix", pref_lengths, pref_rows, None, False, case_sensitive ) + _verify_rich_config_group( + "suffix", suff_lengths, suff_rows, None, False, case_sensitive + ) _verify_rich_config_group( "prefix search", pref_search_lengths, @@ -310,9 +310,6 @@ def RichMultiHashEmbed( True, case_sensitive, ) - _verify_rich_config_group( - "suffix", suff_lengths, suff_rows, None, False, case_sensitive - ) _verify_rich_config_group( "suffix search", suff_search_lengths, @@ -324,10 +321,10 @@ def RichMultiHashEmbed( if pref_rows is not None: rows.extend(pref_rows) - if pref_search_rows is not None: - rows.extend(pref_search_rows) if suff_rows is not None: rows.extend(suff_rows) + if pref_search_rows is not None: + rows.extend(pref_search_rows) if suff_search_rows is not None: rows.extend(suff_search_rows) @@ -344,9 +341,9 @@ def RichMultiHashEmbed( RichFeatureExtractor( case_sensitive=case_sensitive, pref_lengths=pref_lengths, + suff_lengths=suff_lengths, pref_search_chars=pref_search_chars, pref_search_lengths=pref_search_lengths, - suff_lengths=suff_lengths, suff_search_chars=suff_search_chars, suff_search_lengths=suff_search_lengths, ), diff --git 
a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index a31242a9e..7293c69a9 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -11,9 +11,9 @@ def RichFeatureExtractor( *, case_sensitive: bool, pref_lengths: Optional[List[int]] = None, + suff_lengths: Optional[List[int]] = None, pref_search_chars: Optional[str] = None, pref_search_lengths: Optional[List[int]] = None, - suff_lengths: Optional[List[int]] = None, suff_search_chars: Optional[str] = None, suff_search_lengths: Optional[List[int]] = None, ) -> Model[List[Doc], List[Ints2d]]: diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 39a199ff3..05e75dc17 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -67,7 +67,7 @@ cdef void _search_for_chars( Py_UCS4* result_buf, const int result_buf_len, bint suffs_not_prefs -) +) nogil cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 33f141ced..0eeda8199 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1755,10 +1755,11 @@ cdef class Doc: derived from the raw text of each token. Generally: - p_ variables relate to prefixes (affixes starting at the beginning of the word) - s_ variables relate to suffixes (affixes starting at the end of the word) - ps_ variables relate to searches starting at the beginning of the word - ss_ variables relate to searches starting at the end of the word + + p_ variables relate to prefixes (affixes starting at the beginning of the word) + s_ variables relate to suffixes (affixes starting at the end of the word) + ps_ variables relate to searches starting at the beginning of the word + ss_ variables relate to searches starting at the end of the word cs: if *False*, hashes are generated based on the lower-case version of each token. p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. 
For example, if *p_lengths==[2, 3]*, @@ -1770,7 +1771,7 @@ cdef class Doc: the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables case-insensitivity to be handled efficiently. ps_search_l: the number of characters in *ps_search* and hence also in *ps_lookup* - ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if + ps_lengths: an Ints1d specifying the lengths of search results (from the beginning) to be hashed. For example, if *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "a" and "ac". ss_search: a byte array containing characters to search for within each token, starting at the end. @@ -1778,12 +1779,13 @@ the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables case-insensitivity to be handled efficiently. ss_l: the number of characters in *ss_search* and hence also in *ss_lookup* - ss_lengths: an integer list specifying the lengths of search results to be hashed. For example if - *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC" and *cs==False*, the searched strings hashed for + ss_lengths: an integer list specifying the lengths of search results (from the end) to be hashed. For example, if + *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "c" and "ca". 
For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by - *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to + *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* + would correspond to [[hash("sp"), [hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")], [hash("an"), hash("nd"), hash(" and", hash(" and"), hash(" "), hash(" "))], @@ -1812,18 +1814,18 @@ cdef class Doc: # Define / allocate buffer (pr/sr: result buffers) cdef int aff_buf_l = p_max_l + s_max_l cdef Py_UCS4* aff_buf = self.mem.alloc(aff_buf_l, sizeof(Py_UCS4)) - cdef Py_UCS4* ps_buf = ps_search - cdef Py_UCS4* pl_buf = ps_lookup - cdef Py_UCS4* pr_buf = self.mem.alloc(ps_max_l, sizeof(Py_UCS4)) - cdef Py_UCS4* ss_buf = ss_search - cdef Py_UCS4* sl_buf = ss_lookup - cdef Py_UCS4* sr_buf = self.mem.alloc(ss_max_l, sizeof(Py_UCS4)) + cdef Py_UCS4* ps_s_buf = ps_search + cdef Py_UCS4* ps_l_buf = ps_lookup + cdef Py_UCS4* ps_r_buf = self.mem.alloc(ps_max_l, sizeof(Py_UCS4)) + cdef Py_UCS4* ss_s_buf = ss_search + cdef Py_UCS4* ss_l_buf = ss_lookup + cdef Py_UCS4* ss_r_buf = self.mem.alloc(ss_max_l, sizeof(Py_UCS4)) # Define memory views on length arrays - cdef int[:] p_v = p_lengths - cdef int[:] s_v = s_lengths - cdef int[:] ps_v = ps_lengths - cdef int[:] ss_v = ss_lengths + cdef int[:] p_lengths_v = p_lengths + cdef int[:] s_lengths_v = s_lengths + cdef int[:] ps_lengths_v = ps_lengths + cdef int[:] ss_lengths_v = ss_lengths # Define working variables cdef TokenC tok_c @@ -1838,27 +1840,27 @@ cdef class Doc: _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs) for hash_idx in range(p_h_num): - hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0) + hashes[tok_i, hash_idx] = hash32(aff_buf, p_lengths_v[hash_idx] * sizeof(Py_UCS4), 0) for hash_idx in range(p_h_num, s_h_end): - aff_len = s_v[hash_idx - p_h_num] + 
aff_len = s_lengths_v[hash_idx - p_h_num] hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0) if ps_h_num > 0: - _search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False) + _search_for_chars(text_buf, tok_idx, tok_len, ps_s_buf, ps_l_buf, ps_l, ps_r_buf, ps_max_l, False) for hash_idx in range(s_h_end, ps_h_end): - aff_len = ps_v[hash_idx - s_h_end] - hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0) + aff_len = ps_lengths_v[hash_idx - s_h_end] + hashes[tok_i, hash_idx] = hash32(ps_r_buf, aff_len * sizeof(Py_UCS4), 0) if ss_h_num > 0: - _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True) + _search_for_chars(text_buf, tok_idx, tok_len, ss_s_buf, ss_l_buf, ss_l, ss_r_buf, ss_max_l, True) for hash_idx in range(ps_h_end, ss_h_end): - aff_len = ss_v[hash_idx - ps_h_end] - hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0) + aff_len = ss_lengths_v[hash_idx - ps_h_end] + hashes[tok_i, hash_idx] = hash32(ss_r_buf, aff_len * sizeof(Py_UCS4), 0) self.mem.free(aff_buf) - self.mem.free(pr_buf) - self.mem.free(sr_buf) + self.mem.free(ps_r_buf) + self.mem.free(ss_r_buf) return hashes @staticmethod @@ -2051,8 +2053,9 @@ cdef void _copy_chars( """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts any upper-case characters to lower case within the target buffer. 
""" - memcpy(target, source, length * sizeof(Py_UCS4)) cdef int idx + + memcpy(target, source, length * sizeof(Py_UCS4)) if to_lower: for idx in range(length): if Py_UNICODE_ISUPPER(target[idx]): @@ -2089,15 +2092,18 @@ cdef void _set_affixes( if tok_len < pref_len: memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len)) aff_buf_idx = aff_buf_len - suff_len + if tok_len < suff_len: memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len)) aff_buf_idx = aff_buf_len - tok_len if suff_len > 0: + # in_word_idx: the index within the token where the suffix starts in_word_idx = aff_buf_idx + tok_len - aff_buf_len if in_word_idx < pref_len: memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx)) aff_buf_idx += filled_pref_len - in_word_idx + in_word_idx = aff_buf_idx + tok_len - aff_buf_len if aff_buf_idx < aff_buf_len: _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower) @@ -2112,7 +2118,7 @@ cdef void _search_for_chars( Py_UCS4* result_buf, const int result_buf_len, bint suffs_not_prefs -): +) nogil: """ Search a word within a string for characters within *search_buf*, starting at the beginning or end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches, the corresponding character from *lookup_buf* is added to *result_buf*. diff --git a/spacy/util.py b/spacy/util.py index 8eaaf0889..c3add9fc9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1791,8 +1791,7 @@ def get_arrays_for_search_chars( def get_ordered_raw_bytes( search: List[bytes], lookup: List[bytes] ) -> Tuple[bytes, bytes]: - """Flatten the two lists, ordering both by the entries in *search* - using the native endianness of the platform. + """Flatten the two lists, ordering both by the entries in *search*. 
""" num_search = [list(entry) for entry in search] search = [entry for _, entry in sorted(zip(num_search, search))] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 62ce83609..f34f570f8 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -210,13 +210,14 @@ one letter or letters regularly alternate with another letter or letters depending on the presence of some other letter before or after it, e.g. German plural nouns where the final two vowels are `ä-e` regularly correspond to singular lemmas where the `e` is no longer present and the `ä` has become `a`. -For most languages, searching is likely to be useful starting at the end -(`suff_*`), but the ability to search from the beginning (`pref_*`) is also -offered for completeness. Search characters should consist of all characters -that regularly alternate with other characters in the language in question or -whose presence before or after characters that would otherwise alternate -prevents the alternation from occurring, e.g. an `ä` in a German plural noun -does not become `a` if it is the third or fourth vowel from the end of the word. +For most languages used with spaCy, searching is likely to be useful starting +at the end (`suff_*`), but the ability to search from the beginning (`pref_*`) +is also offered for completeness. Search characters should consist of all +characters that regularly alternate with other characters in the language in +question or whose presence before or after characters that would otherwise +alternate prevents the alternation from occurring, e.g. an `ä` in a German +plural noun does not become `a` if it is the third or fourth vowel from the +end of the word. 
| Name | Description | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -226,13 +227,13 @@ does not become `a` if it is the third or fourth vowel from the end of the word. | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ | | `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ | | `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]~~ | +| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ | | `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ | -| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ | +| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ | | `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. 
~~Optional[List[int]]~~ | | `pref_search_rows` | The number of rows for each of `pref_search_lengths`. ~~Optional[List[int]~~ | -| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ | | `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ | -| `suff_search_chars` | A string containing characters to search for starting from the end of each word. May not contain characters that occupy four bytes in UTF-16; if `case_sensitive==True`, may not contain upper-case letters. ~~Optional[str]~~ | +| `suff_search_chars` | A string containing characters to search for starting from the end of each word. ~~Optional[str]~~ | | `suff_search_lengths` | The lengths of search result strings to use as features, where the searches start from the end of each word. ~~Optional[List[int]]~~ | | `suff_search_rows` | The number of rows for each of `suff_search_lengths`. ~~Optional[List[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |