Final touches

This commit is contained in:
parent a972791c9a
commit 35d0c217d2
@@ -248,11 +248,13 @@ def RichMultiHashEmbed(
     the end; and each character that matches one of the search characters is added,
     in order, to the string to be used as a feature. The search continues until
     either the search result string is full or the whole word has been examined.
-    This is useful because many languages exhibit morphological alternations where
+    This is useful because some languages exhibit morphological alternations where
     one letter or letters regularly alternate with another letter or letters
     depending on the presence of some other letter before or after it, e.g. German
     plural nouns where the final two vowels are `ä-e` regularly correspond to
-    singular lemmas where the `e` is no longer present and the `ä` has become `a`.
+    singular lemmas where the `e` is no longer present and the `ä` has become `a`,
+    e.g. `die Bäche` (plural) vs. `der Bach` (singular).

     For most languages used with spaCy, searching is likely to be useful starting
     at the end (`suff_*`), but the ability to search from the beginning (`pref_*`)
     is also offered for completeness. Search characters should consist of all
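To make the search behaviour described in this docstring concrete, here is a minimal pure-Python sketch; the real extractor works on UTF-8 bytes in Cython, and the function name and exact tie-breaking here are illustrative assumptions, not the library's API:

```python
def search_chars(word: str, search: str, max_len: int, suffs_not_prefs: bool = True) -> str:
    # Examine the word's characters in order, from the end (suffix search)
    # or the beginning; keep each character that occurs in the search set
    # until the result string is full or the whole word has been examined.
    source = reversed(word) if suffs_not_prefs else word
    result = ""
    for ch in source:
        if ch in search:
            result += ch
            if len(result) == max_len:
                break
    return result

# The German alternation from the docstring: the plural "Bäche" surfaces
# "e" and "ä" when searching vowels from the end, while the singular
# "Bach" surfaces only "a", exposing the ä-e/a correspondence.
assert search_chars("Bäche", "aäeou", 2) == "eä"
assert search_chars("Bach", "aäeou", 2) == "a"
```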
@@ -268,7 +270,7 @@ def RichMultiHashEmbed(
     prefixes, suffixes and character search results may need to be increased
     accordingly.

-    All lengths must be specified in ascending order.
+    All arrays specifying lengths must be in ascending order.

     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
@@ -286,7 +288,7 @@ def RichMultiHashEmbed(
     pref_rows (Optional[List[int]]): The number of rows for each of `pref_lengths`.
     suff_lengths (Optional[List[int]]): The lengths of suffixes to use as features
         for each word, e.g. for the word `spaCy`:
-        `[1, 3]` would lead to `y` and `aCy` being used as features.
+        `[1, 3]` would lead to `y` and `yCa` being used as features.
     suff_rows (Optional[List[int]]): The number of rows for each of `suff_lengths`.
     pref_search_chars (Optional[str]): A string containing characters to search for
         starting from the beginning of each word.
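The `aCy` to `yCa` correction above appears to reflect that suffix features are read right-to-left from the end of the word; a tiny sketch of that reading (the helper name is hypothetical):

```python
def suffix_features(word: str, suff_lengths: list) -> list:
    # Suffixes are read right-to-left: "spaCy" reversed is "yCaps",
    # so length 3 yields "yCa" rather than "aCy".
    reversed_word = word[::-1]
    return [reversed_word[:n] for n in suff_lengths]

assert suffix_features("spaCy", [1, 3]) == ["y", "yCa"]
```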
@@ -1,11 +1,8 @@
 from typing import List, Optional, Callable, Tuple
-from spacy.util import get_search_char_byte_arrays
-
-# from ..util import get_arrays_for_search_chars
 from thinc.types import Ints2d
 from thinc.api import Model, registry, get_current_ops

 from ..tokens import Doc
+from ..util import get_search_char_byte_arrays


 @registry.layers("spacy.RichFeatureExtractor.v1")
@@ -21,13 +18,17 @@ def RichFeatureExtractor(
 ) -> Model[List[Doc], List[Ints2d]]:
     ops = get_current_ops()
     if pref_search_chars is not None:
-        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(pref_search_chars, case_sensitive)
+        ps_search_chars, ps_width_offsets = get_search_char_byte_arrays(
+            pref_search_chars, case_sensitive
+        )
     else:
         ps_search_chars = bytes()
         ps_width_offsets = bytes()
     if suff_search_chars is not None:
-        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(suff_search_chars, case_sensitive)
+        ss_search_chars, ss_width_offsets = get_search_char_byte_arrays(
+            suff_search_chars, case_sensitive
+        )
     else:
         ss_search_chars = bytes()
         ss_width_offsets = bytes()
@@ -36,12 +37,8 @@ def RichFeatureExtractor(
         forward,
         attrs={
             "case_sensitive": case_sensitive,
-            "p_lengths": bytes(pref_lengths)
-            if pref_lengths is not None
-            else bytes(),
-            "s_lengths": bytes(suff_lengths)
-            if suff_lengths is not None
-            else bytes(),
+            "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
+            "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
             "ps_search_chars": ps_search_chars,
             "ps_width_offsets": ps_width_offsets,
             "ps_lengths": bytes(pref_search_lengths)
@@ -16,7 +16,6 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
-from spacy.tokens.doc import get_fnv1a_hash
 from spacy.util import get_search_char_byte_arrays
 from spacy.vocab import Vocab

@@ -998,18 +997,32 @@ def test_doc_spans_setdefault(en_tokenizer):


-EMPTY_HASH_VALUE = 0xCBF29CE484222325
+FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
+FNV1A_PRIME = 0x00000100000001B3
+
+
+def _get_fnv1a_hash(input: bytes) -> int:
+    hash_val = FNV1A_OFFSET_BASIS
+    length = len(input)
+    offset = 0
+
+    while offset < length:
+        hash_val ^= input[offset]
+        hash_val *= FNV1A_PRIME
+        hash_val %= 2**64
+        offset += 1
+    return hash_val


 def test_fnv1a_hash():
     """Checks the conformity of the 64-bit FNV1A implementation with
     http://www.isthe.com/chongo/src/fnv/test_fnv.c.
-    The method called here is only used in testing; in production
-    code, the hashing is performed in a fashion that is interweaved
-    with other logic. The conformity of the production code is
-    demonstrated by the character combination hash tests, where
-    hashes produced by the production code are tested for equality
-    against hashes produced by the test code.
-    s"""
+    The method called here, _get_fnv1a_hash(), is only used in testing;
+    in production code, the hashing is performed in a fashion that is interwoven
+    with other logic. The conformity of the production code is demonstrated by the
+    character combination hash tests, where hashes produced by the production code
+    are tested for equality against hashes produced by _get_fnv1a_hash().
+    """
     INPUTS = [
         b"",
         b"a",
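The first two isthe.com test vectors are well known and can be checked directly against the `_get_fnv1a_hash` helper defined above:

```python
# Published 64-bit FNV-1a test vectors from
# http://www.isthe.com/chongo/src/fnv/test_fnv.c
assert _get_fnv1a_hash(b"") == 0xCBF29CE484222325  # the offset basis itself
assert _get_fnv1a_hash(b"a") == 0xAF63DC4C8601EC8C
```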
@@ -1424,14 +1437,14 @@ def test_fnv1a_hash():

     assert len(INPUTS) == len(OUTPUTS)
     for i in range(len(INPUTS)):
-        assert get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
+        assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]


 def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
     encoded_input = input.encode("UTF-8")
     if reverse:
         encoded_input = encoded_input[::-1]
-    return get_fnv1a_hash(encoded_input)
+    return _get_fnv1a_hash(encoded_input)


 @pytest.mark.parametrize("case_sensitive", [True, False])
@@ -1566,7 +1579,7 @@ def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
     assert hashes[3][4] == _encode_and_hash("pr")


-def test_get_character_combination_hashes_copying_in_middle(en_tokenizer):
+def test_get_character_combination_hashes_various_lengths(en_tokenizer):
     doc = en_tokenizer("sp𐌞Cé")

     for p_length in range(1, 8):
@@ -6,8 +6,6 @@ from ..structs cimport TokenC, LexemeC, SpanC
 from ..typedefs cimport attr_t
 from ..attrs cimport attr_id_t

-from libc.stdint cimport uint32_t
-

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
@@ -20,10 +18,6 @@ ctypedef fused LexemeOrToken:
     const_TokenC_ptr


-cdef extern from "unicodeobject.h":
-    bint Py_UNICODE_ISUPPER(Py_UCS4 ch)
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-

 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@@ -1,7 +1,7 @@
 from typing import Callable, Protocol, Iterable, Iterator, Optional
 from typing import Union, Tuple, List, Dict, Any, overload
 from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d
+from thinc.types import Floats1d, Floats2d, Ints2d
 from .span import Span
 from .token import Token
 from ._dict_proxies import SpanGroups
@@ -126,7 +126,7 @@ class Doc:
         blocked: Optional[List[Span]] = ...,
         missing: Optional[List[Span]] = ...,
         outside: Optional[List[Span]] = ...,
-        default: str = ...,
+        default: str = ...
     ) -> None: ...
     @property
     def noun_chunks(self) -> Iterator[Span]: ...
@@ -190,6 +190,3 @@ class Doc:

     @staticmethod
     def _get_array_attrs() -> Tuple[Any]: ...
-
-def get_fnv1a_hash(input: bytes) -> int: ...
-
@@ -3,7 +3,6 @@ from typing import Set, List

 cimport cython
 cimport numpy as np
-from cpython cimport array
 from libc.string cimport memcpy, memcmp, memset, strlen
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
@@ -955,7 +954,7 @@ cdef class Doc:
         cdef int i, j
         cdef attr_id_t feature
         cdef np.ndarray[attr_t, ndim=2] output
-        # Handle scalar/list inputs of cdef np.strings/ints for py_attr_ids
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
         # See also #3064
         if isinstance(py_attr_ids, str):
             # Handle inputs like doc.to_array('ORTH')
@@ -1780,7 +1779,7 @@ cdef class Doc:

         Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
         the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
-        for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with a maximum of four-byte
+        for ensuring that lengths being passed in cannot exceed 63 and hence that resulting values with maximally four-byte
         character widths can never exceed 255.

         Note that this method performs no data validation itself as it expects the calling code will already have done so, and
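The 255 bound follows from simple arithmetic: lengths are capped at 63 and a UTF-8 character occupies at most four bytes, so the resulting byte offsets fit in a single byte (a quick check, not part of the source):

```python
MAX_LENGTH = 63      # cap enforced by the calling code
MAX_UTF8_WIDTH = 4   # widest possible UTF-8 character
assert MAX_LENGTH * MAX_UTF8_WIDTH == 252 <= 255
```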
@@ -2117,7 +2116,7 @@ cdef void _search_for_chars(
     suffs_not_prefs: if *True*, searching starts from the end of the word;
         if *False*, from the beginning.
     res_buf: the buffer in which to place the search results.
-    l_buf: a buffer of length *max_res_l* in which to store the byte lengths.
+    l_buf: a buffer of length *max_res_l* in which to store the end byte offsets of the found characters.
         The calling code ensures that lengths greater than 255 cannot occur.
     """
     cdef int res_buf_idx = 0, l_buf_idx = 0, ch_wdth, tok_start_idx, search_char_idx, end_search_idx
@@ -2162,17 +2161,6 @@ cdef void _search_for_chars(
 cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
 cdef uint64_t FNV1A_PRIME = 0x00000100000001B3

-def get_fnv1a_hash(input: bytes):
-    """ Python-callable method to facilitate testing. """
-    cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
-    cdef int length = len(input), offset = 0
-
-    while offset < length:
-        hash_val ^= input[offset]
-        hash_val *= FNV1A_PRIME
-        offset += 1
-    return hash_val
-

 @cython.boundscheck(False)  # Deactivate bounds checking
 cdef int _write_hashes(
@@ -1741,14 +1741,15 @@ def get_search_char_byte_arrays(
     search_char_string: str, case_sensitive: bool
 ) -> Tuple[bytes, bytes]:
     """
-    This function supports the rich feature extractor. It orders the characters in
-    *search_char_string*, removing any duplicates, encodes them with UTF-8, and
-    returns the result together with a byte array containing the offsets where the
-    characters of various byte lengths start within the result, i.e.
+    This function supports *RichMultiHashEmbed*. It orders the characters in
+    *search_char_string*, removing any duplicates, encodes them as UTF-8, and
+    returns the result buffer together with a byte array containing the offsets
+    where the characters of various byte lengths start within the result buffer,
+    i.e.

     <1-byte-start>, <2-byte-start>, <3-byte-start>, <4-byte-start>, <4-byte-end>.

-    If the string does not contain any characters of length *n*,
+    If the result buffer does not contain any characters of length *n*,
     <n_byte_start> == <n+1_byte_start>.
     """
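A rough pure-Python sketch of the behaviour this docstring describes; the real helper lives in `spacy.util`, and the casefolding and ordering details here are assumptions:

```python
from typing import Tuple

def _sketch_search_char_byte_arrays(
    search_char_string: str, case_sensitive: bool
) -> Tuple[bytes, bytes]:
    chars = search_char_string if case_sensitive else search_char_string.lower()
    # Deduplicate, then order by UTF-8 byte length (1-4) and byte value.
    encoded = sorted({c.encode("utf-8") for c in chars}, key=lambda e: (len(e), e))
    result = b"".join(encoded)
    offsets, pos = [], 0
    for width in (1, 2, 3, 4):
        offsets.append(pos)              # <width-byte-start>
        pos += sum(len(e) for e in encoded if len(e) == width)
    offsets.append(pos)                  # <4-byte-end>
    return result, bytes(offsets)        # assumes each offset fits in a byte

# German vowels: five 1-byte characters, then three 2-byte umlauts.
chars, offs = _sketch_search_char_byte_arrays("äöüaeiou", case_sensitive=False)
assert chars == "aeiouäöü".encode("utf-8")
assert offs == bytes([0, 5, 11, 11, 11])
```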
@@ -205,11 +205,13 @@ characters in each word are examined in order starting at the beginning or at
 the end; and each character that matches one of the search characters is added,
 in order, to the string to be used as a feature. The search continues until
 either the search result string is full or the whole word has been examined.
-This is useful because many languages exhibit morphological alternations where
+This is useful because some languages exhibit morphological alternations where
 one letter or letters regularly alternate with another letter or letters
 depending on the presence of some other letter before or after it, e.g. German
 plural nouns where the final two vowels are `ä-e` regularly correspond to
-singular lemmas where the `e` is no longer present and the `ä` has become `a`.
+singular lemmas where the `e` is no longer present and the `ä` has become `a`,
+e.g. `die Bäche` (plural) vs. `der Bach` (singular).

 For most languages used with spaCy, searching is likely to be useful starting at
 the end (`suff_*`), but the ability to search from the beginning (`pref_*`) is
 also offered for completeness. Search characters should consist of all
@@ -224,7 +226,7 @@ than one UTF-8 character, e.g. _i_ when representing the lower-case form of the
 Turkish letter _İ_. Such situations are supported, but the lengths of prefixes,
 suffixes and character search results may need to be increased accordingly.

-All lengths must be specified in ascending order.
+All arrays specifying lengths must be in ascending order.

 | Name | Description |
 | ---- | ----------- |
|
@ -235,7 +237,7 @@ All lengths must be specified in ascending order.
|
|||
| `case_sensitive` | Whether lower-case and upper-case letters should be distinguished when generating the character combinations to use as features. ~~bool~~ |
|
||||
| `pref_lengths` | The lengths of prefixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `s` and `spa` being used as features. ~~Optional[List[int]~~ |
|
||||
| `pref_rows` | The number of rows for each of `pref_lengths`. ~~Optional[List[int]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `aCy` being used as features. ~~Optional[List[int]~~ |
|
||||
| `suff_lengths` | The lengths of suffixes to use as features for each word, e.g. for the word `spaCy`: `[1, 3]` would lead to `y` and `yCa` being used as features. ~~Optional[List[int]~~ |
|
||||
| `suff_rows` | The number of rows for each of `suff_lengths`. ~~Optional[List[int]~~ |
|
||||
| `pref_search_chars` | A string containing characters to search for starting from the beginning of each word. ~~Optional[str]~~ |
|
||||
| `pref_search_lengths` | The lengths of search result strings to use as features, where the searches start from the beginning of each word. ~~Optional[List[int]]~~ |
|
||||
|
|