diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 7f6898fe4..a0f936f50 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -248,11 +248,13 @@ def RichMultiHashEmbed( the end; and each character that matches one of the search characters is added, in order, to the string to be used as a feature. The search continues until either the search result string is full or the whole word has been examined. - This is useful because many languages exhibit morphological alternations where + This is useful because some languages exhibit morphological alternations where one letter or letters regularly alternate with another letter or letters depending on the presence of some other letter before or after it, e.g. German plural nouns where the final two vowels are `ä-e` regularly correspond to - singular lemmas where the `e` is no longer present and the `ä` has become `a`. + singular lemmas where the `e` is no longer present and the `ä` has become `a`, + as in `die Bäche` (plural) vs. `der Bach` (singular). + For most languages used with spaCy, searching is likely to be useful starting at the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is also offered for completeness. Search characters should consist of all @@ -268,7 +270,7 @@ def RichMultiHashEmbed( prefixes, suffixes and character search results may need to be increased accordingly. - All lengths must be specified in ascending order. + All arrays specifying lengths must be in ascending order. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. @@ -286,7 +288,7 @@ def RichMultiHashEmbed( pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`. suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: - `[1, 3]` would lead to `y` and `aCy` being used as features. + `[1, 3]` would lead to `y` and `yCa` being used as features. suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`. pref_search_chars (Optional[str]): A string containing characters to search for starting from the beginning of each word.
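The docstring above describes how the prefix and suffix length features are built. A minimal, self-contained sketch of the documented behaviour follows; it is not the production code path (the real features are hashed in Cython in `spacy/tokens/doc.pyx`), and the helper name is invented purely for illustration. It only mirrors the documented fact that suffixes are read backwards from the end of the word, which is why `[1, 3]` yields `y` and `yCa` for `spaCy`:

```python
def affix_strings(word, pref_lengths=(1, 3), suff_lengths=(1, 3), case_sensitive=True):
    # Illustrative helper only: build the prefix/suffix strings of the requested lengths.
    if not case_sensitive:
        word = word.lower()
    prefixes = [word[:n] for n in pref_lengths]
    # Suffixes are read backwards from the end of the word, so length 3 for
    # "spaCy" gives "yCa" rather than "aCy".
    suffixes = [word[::-1][:n] for n in suff_lengths]
    return prefixes, suffixes


print(affix_strings("spaCy"))  # (['s', 'spa'], ['y', 'yCa'])
```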
diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index e623da45f..c7766340a 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -1,11 +1,8 @@ from typing import List, Optional, Callable, Tuple -from spacy.util import get_search_char_byte_arrays - -# from ..util import get_arrays_for_search_chars from thinc.types import Ints2d from thinc.api import Model, registry, get_current_ops - from ..tokens import Doc +from ..util import get_search_char_byte_arrays @registry.layers("spacy.RichFeatureExtractor.v1") @@ -21,13 +18,17 @@ def RichFeatureExtractor( ) -> Model[List[Doc], List[Ints2d]]: ops = get_current_ops() if pref_search_chars is not None: - ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive) + ps_search_chars, ps_width_offsets = get_search_char_byte_arrays( + pref_search_chars, case_sensitive + ) else: ps_search_chars = bytes() ps_width_offsets = bytes() if suff_search_chars is not None: - - ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive) + + ss_search_chars, ss_width_offsets = get_search_char_byte_arrays( + suff_search_chars, case_sensitive + ) else: ss_search_chars = bytes() ss_width_offsets = bytes() @@ -36,12 +37,8 @@ def RichFeatureExtractor( forward, attrs={ "case_sensitive": case_sensitive, - "p_lengths": bytes(pref_lengths) - if pref_lengths is not None - else bytes(), - "s_lengths": bytes(suff_lengths) - if suff_lengths is not None - else bytes(), + "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(), + "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(), "ps_search_chars": ps_search_chars, "ps_width_offsets": ps_width_offsets, "ps_lengths": bytes(pref_search_lengths) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index e52142639..a0aac6d54 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -16,7 +16,6 @@ from spacy.lang.xx import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token -from spacy.tokens.doc import get_fnv1a_hash from spacy.util import get_search_char_byte_arrays from spacy.vocab import Vocab @@ -998,18 +997,32 @@ def test_doc_spans_setdefault(en_tokenizer): EMPTY_HASH_VALUE = 0xCBF29CE484222325 +FNV1A_OFFSET_BASIS = 0xCBF29CE484222325 +FNV1A_PRIME = 0x00000100000001B3 + + +def _get_fnv1a_hash(input: bytes) -> int: + hash_val = FNV1A_OFFSET_BASIS + length = len(input) + offset = 0 + + while offset < length: + hash_val ^= input[offset] + hash_val *= FNV1A_PRIME + hash_val %= 2**64 + offset += 1 + return hash_val def test_fnv1a_hash(): """Checks the conformity of the 64-bit FNV1A implementation with http://www.isthe.com/chongo/src/fnv/test_fnv.c. - The method called here is only used in testing; in production - code, the hashing is performed in a fashion that is interweaved - with other logic. The conformity of the production code is - demonstrated by the character combination hash tests, where - hashes produced by the production code are tested for equality - against hashes produced by the test code. - s""" + The method called here, _get_fnv1a_hash(), is only used in testing; + in production code, the hashing is performed in a fashion that is interwoven + with other logic.
The conformity of the production code is demonstrated by the + character combination hash tests, where hashes produced by the production code + are tested for equality against hashes produced by _get_fnv1a_hash(). + """ INPUTS = [ b"", b"a", @@ -1424,14 +1437,14 @@ def test_fnv1a_hash(): assert len(INPUTS) == len(OUTPUTS) for i in range(len(INPUTS)): - assert get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i] + assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i] def _encode_and_hash(input: str, *, reverse: bool = False) -> int: encoded_input = input.encode("UTF-8") if reverse: encoded_input = encoded_input[::-1] - return get_fnv1a_hash(encoded_input) + return _get_fnv1a_hash(encoded_input) @pytest.mark.parametrize("case_sensitive", [True, False]) @@ -1566,7 +1579,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer): assert hashes[3][4] == _encode_and_hash("pr") -def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): +def test_get_character_combination_hashes_various_lengths(en_tokenizer): doc = en_tokenizer("sp𐌞Cé") for p_length in range(1, 8): diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 5f215c009..fc431b151 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -6,8 +6,6 @@ from ..structs cimport TokenC, LexemeC, SpanC from ..typedefs cimport attr_t from ..attrs cimport attr_id_t -from libc.stdint cimport uint32_t - cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil @@ -20,10 +18,6 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr -cdef extern from "unicodeobject.h": - bint Py_UNICODE_ISUPPER(Py_UCS4 ch) - Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch) - cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index a748dfb79..9b9e1ab66 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -1,7 +1,7 @@ from typing import Callable, Protocol, Iterable, Iterator, Optional from typing import Union, Tuple, List, Dict, Any, overload from cymem.cymem import Pool -from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d +from thinc.types import Floats1d, Floats2d, Ints2d from .span import Span from .token import Token from ._dict_proxies import SpanGroups @@ -126,7 +126,7 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ..., + default: str = ... ) -> None: ... @property def noun_chunks(self) -> Iterator[Span]: ... @@ -189,7 +189,4 @@ class Doc: ) -> Ints2d: ... @staticmethod - def _get_array_attrs() -> Tuple[Any]: ... - -def get_fnv1a_hash(input: bytes) -> int: ... - + def _get_array_attrs() -> Tuple[Any]: ... 
\ No newline at end of file diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cbc1bc3c7..9d4839d55 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -3,7 +3,6 @@ from typing import Set, List cimport cython cimport numpy as np -from cpython cimport array from libc.string cimport memcpy, memcmp, memset, strlen from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t @@ -955,7 +954,7 @@ cdef class Doc: cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output - # Handle scalar/list inputs of cdef np.strings/ints for py_attr_ids + # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 if isinstance(py_attr_ids, str): # Handle inputs like doc.to_array('ORTH') @@ -1780,7 +1779,7 @@ cdef class Doc: Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible - for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte + for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with at most four-byte character widths can never exceed 255. Note that this method performs no data validation itself as it expects the calling code will already have done so, and @@ -2117,7 +2116,7 @@ cdef void _search_for_chars( suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. res_buf: the buffer in which to place the search results. - l_buf: a buffer of length *max_res_l* in which to store the byte lengths. + l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters. The calling code ensures that lengths greater than 255 cannot occur. """ cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx @@ -2162,17 +2161,6 @@ cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3 -def get_fnv1a_hash(input: bytes): - """ Python-callable method to facilitate testing. """ - cdef uint64_t hash_val = FNV1A_OFFSET_BASIS - cdef int length = len(input), offset = 0 - - while offset < length: - hash_val ^= input[offset] - hash_val *= FNV1A_PRIME - offset += 1 - return hash_val - @cython.boundscheck(False) # Deactivate bounds checking cdef int _write_hashes( diff --git a/spacy/util.py b/spacy/util.py index aa929c765..e0b9ac308 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1741,14 +1741,15 @@ def get_search_char_byte_arrays( search_char_string: str, case_sensitive: bool ) -> Tuple[bytes, bytes]: """ - This function supports the rich feature extractor. It orders the characters in - *search_char_string*, removing any duplicates, encodes them with UTF-8, and - returns the result together with a byte array containing the offsets where the - characters of various byte lengths start within the result, i.e. + This function supports *RichMultiHashEmbed*. It orders the characters in + *search_char_string*, removing any duplicates, encodes them as UTF-8, and + returns the result buffer together with a byte array containing the offsets + where the characters of various byte lengths start within the result buffer, + i.e. <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
- If the string does not contain any characters of length *n*, + If the result buffer does not contain any characters of length *n*, <n-byte-start> == <n+1-byte-start>. """ diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index bd1077bbe..fbb2e4319 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -205,11 +205,13 @@ characters in each word are examined in order starting at the beginning or at the end; and each character that matches one of the search characters is added, in order, to the string to be used as a feature. The search continues until either the search result string is full or the whole word has been examined. -This is useful because many languages exhibit morphological alternations where +This is useful because some languages exhibit morphological alternations where one letter or letters regularly alternate with another letter or letters depending on the presence of some other letter before or after it, e.g. German plural nouns where the final two vowels are `ä-e` regularly correspond to -singular lemmas where the `e` is no longer present and the `ä` has become `a`. +singular lemmas where the `e` is no longer present and the `ä` has become `a`, +as in `die Bäche` (plural) vs. `der Bach` (singular). + For most languages used with spaCy, searching is likely to be useful starting at the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is also offered for completeness. Search characters should consist of all @@ -224,7 +226,7 @@ than one UTF-8 character, e.g. _i_ when representing the lower-case form of the Turkish letter _İ_. Such situations are supported, but the lengths of prefixes, suffixes and character search results may need to be increased accordingly. -All lengths must be specified in ascending order. +All arrays specifying lengths must be in ascending order. | Name | Description | | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -235,7 +237,7 @@ All lengths must be specified in ascending order. | `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ | | `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]~~ | | `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ | -| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ | +| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]~~ | | `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ | | `pref_search_chars` | A string containing characters to search for starting from the beginning of each word.
~~Optional[str]~~ | | `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
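The `_get_fnv1a_hash()` helper added to `test_doc_api.py` is a plain 64-bit FNV-1a implementation used only to cross-check the Cython hashing code. A minimal standalone sketch of the same scheme, assuming nothing beyond the offset basis and prime shown in the diff and two of the published vectors from `test_fnv.c`, is equivalent to the test helper but uses explicit 64-bit masking instead of `%= 2**64`:

```python
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
MASK_64 = 0xFFFFFFFFFFFFFFFF


def fnv1a_64(data: bytes) -> int:
    # 64-bit FNV-1a: XOR each byte into the hash, then multiply by the prime,
    # keeping only the low 64 bits after each multiplication.
    hash_val = FNV1A_OFFSET_BASIS
    for byte in data:
        hash_val ^= byte
        hash_val = (hash_val * FNV1A_PRIME) & MASK_64
    return hash_val


assert fnv1a_64(b"") == 0xCBF29CE484222325   # empty input returns the offset basis
assert fnv1a_64(b"a") == 0xAF63DC4C8601EC8C  # published vector from test_fnv.c
```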
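The buffer layout documented for `get_search_char_byte_arrays()` in `spacy/util.py` can be walked through with a rough usage sketch. The input string and the decoding loop below are assumptions based only on the documented contract (five offsets delimiting where the 1-, 2-, 3- and 4-byte UTF-8 characters sit within the first buffer), not verified output from the PR:

```python
from spacy.util import get_search_char_byte_arrays

# "a" is 1 byte in UTF-8, "ä" and "é" are 2 bytes, "𐌞" is 4 bytes.
search_chars, width_offsets = get_search_char_byte_arrays("aäé𐌞", case_sensitive=False)

for n in range(1, 5):
    start, end = width_offsets[n - 1], width_offsets[n]
    block = search_chars[start:end]
    # Every character in this block occupies exactly n bytes, so the block can
    # be cut into fixed-size chunks and decoded chunk by chunk.
    chars = [block[i:i + n].decode("utf-8") for i in range(0, len(block), n)]
    print(f"{n}-byte characters: {chars}")
```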