Final touches

This commit is contained in:
richardpaulhudson 2022-11-09 11:40:54 +01:00
parent a972791c9a
commit 35d0c217d2
8 changed files with 58 additions and 64 deletions

View File

@ -248,11 +248,13 @@ def RichMultiHashEmbed(
the end; and each character that matches one of the search characters is added,
in order, to the string to be used as a feature. The search continues until
either the search result string is full or the whole word has been examined.
This is useful because many languages exhibit morphological alternations where
This is useful because some languages exhibit morphological alternations where
one letter or letters regularly alternate with another letter or letters
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
singular lemmas where the `e` is no longer present and the `ä` has become `a`,
e.g. `die Bäche` (plural) vs. `der Bach` (singular).
For most languages used with spaCy, searching is likely to be useful starting
at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
is also offered for completeness. Search characters should consist of all
@ -268,7 +270,7 @@ def RichMultiHashEmbed(
prefixes, suffixes and character search results may need to be increased
accordingly.
All lengths must be specified in ascending order.
All arrays specifying lengths must be in ascending order.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
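
As a rough character-level illustration of the search feature described above (an editorial sketch; the production code operates on UTF-8 bytes in Cython, and the search-character set `"aeä"` is an invented example):

```python
def char_search(word: str, search_chars: str, result_length: int,
                from_end: bool = True) -> str:
    # Scan the word (from the end by default, cf. suff_*) and collect
    # characters that occur in search_chars, stopping when the result
    # string is full or the whole word has been examined.
    result = ""
    for ch in (reversed(word) if from_end else word):
        if ch in search_chars:
            result += ch
            if len(result) == result_length:
                break
    return result

# The German umlaut alternation from the docstring:
assert char_search("Bäche", "aeä", 2) == "eä"  # plural
assert char_search("Bach", "aeä", 2) == "a"    # singular
```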
@ -286,7 +288,7 @@ def RichMultiHashEmbed(
pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
for each word, e.g. for the word `spaCy`:
`[1, 3]` would lead to `y` and `aCy` being used as features.
`[1, 3]` would lead to `y` and `yCa` being used as features.
suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
pref_search_chars (Optional[str]): A string containing characters to search for
starting from the beginning of each word.
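
The updated example strings in this hunk (`y` and `yCa` for `spaCy`) show that suffixes are read back-to-front before being used as features. A minimal string-level sketch of the affix features as documented (the production code hashes UTF-8 bytes directly rather than building Python strings):

```python
def affix_features(word: str, pref_lengths: list, suff_lengths: list) -> list:
    prefixes = [word[:n] for n in pref_lengths]        # read forwards
    suffixes = [word[::-1][:n] for n in suff_lengths]  # read backwards
    return prefixes + suffixes

assert affix_features("spaCy", [1, 3], [1, 3]) == ["s", "spa", "y", "yCa"]
```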

View File

@ -1,11 +1,8 @@
from typing import List, Optional, Callable, Tuple
from spacy.util import get_search_char_byte_arrays
# from ..util import get_arrays_for_search_chars
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc
from ..util import get_search_char_byte_arrays
@registry.layers("spacy.RichFeatureExtractor.v1")
@ -21,13 +18,17 @@ def RichFeatureExtractor(
) -> Model[List[Doc], List[Ints2d]]:
ops = get_current_ops()
if pref_search_chars is not None:
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
pref_search_chars, case_sensitive
)
else:
ps_search_chars = bytes()
ps_width_offsets = bytes()
if suff_search_chars is not None:
ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
suff_search_chars, case_sensitive
)
else:
ss_search_chars = bytes()
ss_width_offsets = bytes()
@ -36,12 +37,8 @@ def RichFeatureExtractor(
forward,
attrs={
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths)
if pref_lengths is not None
else bytes(),
"s_lengths": bytes(suff_lengths)
if suff_lengths is not None
else bytes(),
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
"ps_search_chars": ps_search_chars,
"ps_width_offsets": ps_width_offsets,
"ps_lengths": bytes(pref_search_lengths)

View File

@ -16,7 +16,6 @@ from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.tokens.doc import get_fnv1a_hash
from spacy.util import get_search_char_byte_arrays
from spacy.vocab import Vocab
@ -998,18 +997,32 @@ def test_doc_spans_setdefault(en_tokenizer):
EMPTY_HASH_VALUE = 0xCBF29CE484222325
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
def _get_fnv1a_hash(input: bytes) -> int:
hash_val = FNV1A_OFFSET_BASIS
length = len(input)
offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
hash_val %= 2**64
offset += 1
return hash_val
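# Editorial sanity check: by definition, FNV-1a of the empty input is
# the offset basis, which is why EMPTY_HASH_VALUE above equals
# FNV1A_OFFSET_BASIS.
assert _get_fnv1a_hash(b"") == EMPTY_HASH_VALUE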
def test_fnv1a_hash():
"""Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here is only used in testing; in production
code, the hashing is performed in a fashion that is interweaved
with other logic. The conformity of the production code is
demonstrated by the character combination hash tests, where
hashes produced by the production code are tested for equality
against hashes produced by the test code.
s"""
The method called here, _get_fnv1a_hash(), is only used in testing;
in production code, the hashing is performed in a fashion that is interwoven
with other logic. The conformity of the production code is demonstrated by the
character combination hash tests, where hashes produced by the production code
are tested for equality against hashes produced by _get_fnv1a_hash().
"""
INPUTS = [
b"",
b"a",
@ -1424,14 +1437,14 @@ def test_fnv1a_hash():
assert len(INPUTS) == len(OUTPUTS)
for i in range(len(INPUTS)):
assert get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
encoded_input = input.encode("UTF-8")
if reverse:
encoded_input = encoded_input[::-1]
return get_fnv1a_hash(encoded_input)
return _get_fnv1a_hash(encoded_input)
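# Editorial illustration: with reverse=True the UTF-8 bytes are hashed
# back-to-front, so hashing the three-character suffix of "spaCy" in
# reverse equals hashing the string "yCa" forwards, matching the
# reversed suffix features documented in this commit.
assert _encode_and_hash("spaCy"[-3:], reverse=True) == _encode_and_hash("yCa")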
@pytest.mark.parametrize("case_sensitive", [True, False])
@ -1566,7 +1579,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
assert hashes[3][4] == _encode_and_hash("pr")
def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
def test_get_character_combination_hashes_various_lengths(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
for p_length in range(1, 8):

View File

@ -6,8 +6,6 @@ from ..structs cimport TokenC, LexemeC, SpanC
from ..typedefs cimport attr_t
from ..attrs cimport attr_id_t
from libc.stdint cimport uint32_t
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@ -20,10 +18,6 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr
cdef extern from "unicodeobject.h":
bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1

View File

@ -1,7 +1,7 @@
from typing import Callable, Protocol, Iterable, Iterator, Optional
from typing import Union, Tuple, List, Dict, Any, overload
from cymem.cymem import Pool
from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
from thinc.types import Floats1d, Floats2d, Ints2d
from .span import Span
from .token import Token
from ._dict_proxies import SpanGroups
@ -126,7 +126,7 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
default: str = ...,
default: str = ...
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
@ -189,7 +189,4 @@ class Doc:
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...
def get_fnv1a_hash(input: bytes) -> int: ...
def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -3,7 +3,6 @@ from typing import Set, List
cimport cython
cimport numpy as np
from cpython cimport array
from libc.string cimport memcpy, memcmp, memset, strlen
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
@ -955,7 +954,7 @@ cdef class Doc:
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of cdef np.strings/ints for py_attr_ids
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
if isinstance(py_attr_ids, str):
# Handle inputs like doc.to_array('ORTH')
@ -1780,7 +1779,7 @@ cdef class Doc:
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte
for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with maximally four-byte
character widths can never exceed 255.
Note that this method performs no data validation itself as it expects the calling code will already have done so, and
@ -2117,7 +2116,7 @@ cdef void _search_for_chars(
suffs_not_prefs: if *True*, searching starts from the end of the word;
if *False*, from the beginning.
res_buf: the buffer in which to place the search results.
l_buf: a buffer of length *max_res_l* in which to store the byte lengths.
l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
The calling code ensures that lengths greater than 255 cannot occur.
"""
cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
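
As a byte-level sketch of what these buffers end up containing (an editorial illustration that assumes the buffer semantics stated in the docstring; it is not the Cython routine itself):

```python
def search_bytes_sketch(word: str, search_chars: str, max_results: int):
    # Scan from the end of the word; append the UTF-8 bytes of each
    # matching character to res_buf and record the running end byte
    # offset of each match in l_buf.
    res_buf = b""
    l_buf = []
    for ch in reversed(word):
        if ch in search_chars:
            res_buf += ch.encode("UTF-8")
            l_buf.append(len(res_buf))
            if len(l_buf) == max_results:
                break
    return res_buf, bytes(l_buf)

# "ä" occupies two bytes in UTF-8, so the end offsets are 1 and 3.
assert search_bytes_sketch("Bäche", "aeäöü", 2) == (b"e\xc3\xa4", bytes([1, 3]))
```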
@ -2162,17 +2161,6 @@ cdef void _search_for_chars(
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
def get_fnv1a_hash(input: bytes):
""" Python-callable method to facilitate testing. """
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
cdef int length = len(input), offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
offset += 1
return hash_val
@cython.boundscheck(False) # Deactivate bounds checking
cdef int _write_hashes(

View File

@ -1741,14 +1741,15 @@ def get_search_char_byte_arrays(
search_char_string: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
"""
This function supports the rich feature extractor. It orders the characters in
*search_char_string*, removing any duplicates, encodes them with UTF-8, and
returns the result together with a byte array containing the offsets where the
characters of various byte lengths start within the result, i.e.
This function supports *RichMultiHashEmbed*. It orders the characters in
*search_char_string*, removing any duplicates, encodes them as UTF-8, and
returns the result buffer together with a byte array containing the offsets
where the characters of various byte lengths start within the result buffer,
i.e.
<1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.
If the string does not contain any characters of length *n*,
If the result buffer does not contain any characters of length *n*,
<n-byte-start> == <n+1-byte-start>.
"""

View File

@ -205,11 +205,13 @@ characters in each word are examined in order starting at the beginning or at
the end; and each character that matches one of the search characters is added,
in order, to the string to be used as a feature. The search continues until
either the search result string is full or the whole word has been examined.
This is useful because many languages exhibit morphological alternations where
This is useful because some languages exhibit morphological alternations where
one letter or letters regularly alternate with another letter or letters
depending on the presence of some other letter before or after it, e.g. German
plural nouns where the final two vowels are `ä-e` regularly correspond to
singular lemmas where the `e` is no longer present and the `ä` has become `a`.
singular lemmas where the `e` is no longer present and the `ä` has become `a`,
e.g. `die Bäche` (plural) vs. `der Bach` (singular).
For most languages used with spaCy, searching is likely to be useful starting at
the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
also offered for completeness. Search characters should consist of all
@ -224,7 +226,7 @@ than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
suffixes and character search results may need to be increased accordingly.
All lengths must be specified in ascending order.
All arrays specifying lengths must be in ascending order.
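
For example (an illustrative configuration, not one taken from this diff): with `suff_search_chars = "aeäöü"` and a search-result length of 2, `Bäche` yields the feature string `eä` while `Bach` yields just `a`, giving the model a compact, regular signal for the umlaut alternation described above.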
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -235,7 +237,7 @@ All lengths must be specified in ascending order.
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
| `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]]~~ |
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]]~~ |
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]]~~ |
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]]~~ |
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |