Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-30 10:00:04 +03:00)

Move functionality to richfeatureextractor

commit 5d24934cf5 (parent 5a08596f92)

setup.py (1 line changed)
@@ -34,6 +34,7 @@ MOD_NAMES = [
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
     "spacy.ml.parser_model",
+    "spacy.ml.richfeatureextractor",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
@@ -973,6 +973,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1051 = ("Invalid rich group config '{label}'.")
     E1052 = ("Length > 63 in rich group config '{label}'.")
     E1053 = ("Rich group config {label} specifies lengths that are not in ascending order.")
+    E1054 = ("Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs.")


     # Deprecated model shortcuts, only used in errors and warnings
@@ -151,7 +151,7 @@ def MultiHashEmbed(
     Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     if len(rows) != len(attrs):
-        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
+        raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))
     seed = 7

     def make_hash_embed(index):
@@ -253,7 +253,7 @@ def RichMultiHashEmbed(
     """

     if len(rows) != len(attrs):
-        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
+        raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))

     _verify_rich_config_group("prefix", pref_lengths, pref_rows)
     _verify_rich_config_group("suffix", suff_lengths, suff_rows)
@@ -296,7 +296,7 @@ def RichMultiHashEmbed(
             ),
             max_out,
             ragged2list(),
-            Dropout(0.0)
+            Dropout(0.0),
         )
     else:
         model = chain(
@@ -305,7 +305,7 @@ def RichMultiHashEmbed(
             with_array(concatenate(*embeddings)),
             max_out,
             ragged2list(),
-            Dropout(0.0)
+            Dropout(0.0),
         )
     return model

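The _verify_rich_config_group() helper called above is not shown in this diff. As a rough sketch only, its checks can be inferred from the error codes E1051-E1053 visible in the errors hunk; the signature and the exact conditions below are assumptions rather than code from the commit:

from typing import List, Optional

from spacy.errors import Errors


def _verify_rich_config_group(
    label: str,
    lengths: Optional[List[int]],
    rows: Optional[List[int]],
) -> None:
    # Hypothetical sketch: lengths and rows must be supplied together and have
    # matching lengths (E1051), each length must fit the one-byte buffers used
    # by the extractor, i.e. be at most 63 (E1052), and the lengths must be in
    # ascending order (E1053).
    if (lengths is None) != (rows is None):
        raise ValueError(Errors.E1051.format(label=label))
    if lengths is None:
        return
    if len(lengths) != len(rows):
        raise ValueError(Errors.E1051.format(label=label))
    if any(length > 63 for length in lengths):
        raise ValueError(Errors.E1052.format(label=label))
    if sorted(lengths) != lengths:
        raise ValueError(Errors.E1053.format(label=label))
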
spacy/ml/richfeatureextractor.pxd (new file, 27 lines)

@@ -0,0 +1,27 @@
cimport numpy as np

cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int p_max_l,
    unsigned char* pref_l_buf,
) nogil


cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int s_max_l,
    unsigned char* suff_l_buf,
) nogil


cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int res_buf_last,
    np.uint64_t* hashes_ptr,
) nogil
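These declarations are implemented in spacy/ml/richfeatureextractor.pyx below. As an illustration only (not part of the commit), the prefix-length helper records, for each of the first p_max_l characters of a UTF-8 byte string, the byte length of the prefix ending with that character, padding with the byte length of the whole word when the word is shorter; a pure-Python sketch of the same idea:

def prefix_byte_lengths(tok: bytes, p_max_l: int) -> list:
    # Pure-Python sketch of the logic declared as _set_prefix_lengths: walk the
    # UTF-8 bytes, treat every byte that is not a continuation byte (10xxxxxx)
    # as the start of a new character, and record where each of the first
    # p_max_l characters ends.
    lengths = []
    idx = 1
    while len(lengths) < p_max_l:
        if idx >= len(tok) or (tok[idx] & 0xC0) != 0x80:
            lengths.append(idx)
            if idx >= len(tok):
                break
        idx += 1
    while len(lengths) < p_max_l:
        # Pad with the byte length of the whole word, as the docstrings below describe.
        lengths.append(lengths[-1])
    return lengths


assert prefix_byte_lengths("spaCy".encode("utf8"), 3) == [1, 2, 3]
assert prefix_byte_lengths("✨".encode("utf8"), 3) == [3, 3, 3]  # a single three-byte character

The suffix helper is the mirror image, walking backwards from the end of the byte string.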

@@ -1,42 +0,0 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc


@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
    *,
    case_sensitive: bool,
    pref_lengths: Optional[List[int]] = None,
    suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
    return Model(
        "extract_character_combination_hashes",
        forward,
        attrs={
            "case_sensitive": case_sensitive,
            "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
            "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
        },
    )


def forward(
    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
    ops = model.ops
    case_sensitive: bool = model.attrs["case_sensitive"]
    p_lengths: bytes = model.attrs["p_lengths"]
    s_lengths: bytes = model.attrs["s_lengths"]
    features: List[Ints2d] = []
    for doc in docs:
        hashes = doc.get_character_combination_hashes(
            case_sensitive=case_sensitive,
            p_lengths=p_lengths,
            s_lengths=s_lengths,
        )
        features.append(ops.asarray2i(hashes, dtype="uint64"))

    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
    return features, backprop
spacy/ml/richfeatureextractor.pyi (new file, 9 lines)

@@ -0,0 +1,9 @@
from ..tokens import Doc

def get_character_combination_hashes(
    *,
    doc: Doc,
    case_sensitive: bool,
    p_lengths: bytes,
    s_lengths: bytes,
): ...

spacy/ml/richfeatureextractor.pyx (new file, 233 lines)

@@ -0,0 +1,233 @@
from typing import List, Optional, Callable, Tuple
import numpy
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..symbols import NAMES as SYMBOLS_BY_INT

cimport numpy as np
from cymem.cymem cimport Pool
from libc.string cimport memset, strlen
from libc.stdint cimport uint64_t
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from ..typedefs cimport attr_t


@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
    *,
    case_sensitive: bool,
    pref_lengths: Optional[List[int]] = None,
    suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
    # Because the calling code guarantees that the integers in the list are each less than 256,
    # the integer list can be converted into *bytes*.
    return Model(
        "extract_character_combination_hashes",
        forward,
        attrs={
            "case_sensitive": case_sensitive,
            "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
            "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
        },
    )


def forward(
    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
    ops = model.ops
    case_sensitive: bool = model.attrs["case_sensitive"]
    p_lengths: bytes = model.attrs["p_lengths"]
    s_lengths: bytes = model.attrs["s_lengths"]
    features: List[Ints2d] = []
    for doc in docs:
        hashes = get_character_combination_hashes(
            doc=doc,
            case_sensitive=case_sensitive,
            p_lengths=p_lengths,
            s_lengths=s_lengths,
        )
        features.append(ops.asarray2i(hashes, dtype="uint64"))

    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
    return features, backprop


def get_character_combination_hashes(
    *,
    Doc doc,
    const bint case_sensitive,
    const unsigned char* p_lengths,
    const unsigned char* s_lengths,
):
    """
    Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
    derived from the raw text of each token.

    doc: the document
    case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
    p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
        For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
    s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
        For example, if *s_lengths==[2, 3]* and *case_sensitive == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".

    Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
    the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
    for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
    character widths, that individual values within buffers can never exceed the capacity of a single byte (255).

    Note that this method performs no data validation itself, as it expects the calling code to have done so already;
    the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
    """

    # Work out lengths
    cdef int p_lengths_l = strlen(<char*> p_lengths)
    cdef int s_lengths_l = strlen(<char*> s_lengths)
    cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
    cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0

    # Define / allocate buffers
    cdef Pool mem = Pool()
    cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
    cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
    cdef int doc_l = doc.length
    cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
        (doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
    cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data

    # Define working variables
    cdef TokenC tok_c
    cdef int tok_i, tok_str_l
    cdef attr_t num_tok_attr
    cdef bytes tok_str_bytes
    cdef const unsigned char* tok_str

    for tok_i in range(doc_l):
        tok_c = <TokenC> doc.c[tok_i]
        num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
        if num_tok_attr < len(SYMBOLS_BY_INT):  # hardly ever happens
            if num_tok_attr == 0:
                tok_str_bytes = b""
            else:
                tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
            tok_str = tok_str_bytes
            tok_str_l = len(tok_str_bytes)
        else:
            tok_str, tok_str_l = doc.vocab.strings.utf8_ptr(num_tok_attr)

        if p_max_l > 0:
            _set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
            hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)

        if s_max_l > 0:
            _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
            hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)

    return hashes

cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int p_max_l,
    unsigned char* pref_l_buf,
) nogil:
    """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
    characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
    of the whole word.

    tok_str: a UTF-8 representation of a string.
    tok_str_l: the length of *tok_str*.
    p_max_l: the number of characters to process at the beginning of the word.
    pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
    responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
    within the buffer can never exceed the capacity of a single byte (255).
    """
    cdef int tok_str_idx = 1, pref_l_buf_idx = 0

    while pref_l_buf_idx < p_max_l:
        if (tok_str_idx >= tok_str_l
            or
            ((tok_str[tok_str_idx] & 0xc0) != 0x80)  # not a continuation character
        ):
            pref_l_buf[pref_l_buf_idx] = tok_str_idx
            pref_l_buf_idx += 1
            if tok_str_idx >= tok_str_l:
                break
        tok_str_idx += 1

    if pref_l_buf_idx < p_max_l:
        memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)


cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int s_max_l,
    unsigned char* suff_l_buf,
) nogil:
    """ Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
    characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
    of the whole word.

    tok_str: a UTF-8 representation of a string.
    tok_str_l: the length of *tok_str*.
    s_max_l: the number of characters to process at the end of the word.
    suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
    responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
    within the buffer can never exceed the capacity of a single byte (255).
    """
    cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0

    while suff_l_buf_idx < s_max_l:
        if (tok_str[tok_str_idx] & 0xc0) != 0x80:  # not a continuation character
            suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
            suff_l_buf_idx += 1
        tok_str_idx -= 1
        if tok_str_idx < 0:
            break

    if suff_l_buf_idx < s_max_l:
        memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)


cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3


cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int res_buf_last,
    np.uint64_t* hashes_ptr,
) nogil:
    """ Write 64-bit FNV1A hashes for a token/rich property group combination.

    res_buf: the string from which to generate the hash values.
    aff_l_buf: one-byte lengths describing how many characters to hash.
    offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
    res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in
    *res_buf*; if affixes should start at the beginning of *res_buf*, *0*.
    hashes_ptr: a pointer starting from which the new hashes should be written.

    Returns: the number of hashes written.
    """

    cdef int last_offset = 0, hash_idx = 0, offset, aff_l
    cdef uint64_t hash_val = FNV1A_OFFSET_BASIS

    while True:
        aff_l = aff_l_buf[hash_idx]
        if aff_l == 0:
            return hash_idx
        offset = offset_buf[aff_l - 1]
        while last_offset < offset:
            if res_buf_last > 0:
                hash_val ^= res_buf[res_buf_last - last_offset]
            else:
                hash_val ^= res_buf[last_offset]
            hash_val *= FNV1A_PRIME
            last_offset += 1
        hashes_ptr[hash_idx] = hash_val
        hash_idx += 1
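For orientation, here is a rough usage sketch of the new layer. It is not taken from the commit; the pipeline, text and length values are illustrative, and it assumes the branch is installed so that spacy.ml.richfeatureextractor is importable. The layer has no trainable weights, so running it simply returns one uint64 array per Doc, with one row per token and one column per requested prefix or suffix length:

import spacy
from spacy.ml.richfeatureextractor import RichFeatureExtractor

nlp = spacy.blank("en")
doc = nlp("spaCy and Prodigy")

# Hash the 1- and 3-character prefixes plus the 2- and 3-character suffixes
# of each (lower-cased) token.
extractor = RichFeatureExtractor(
    case_sensitive=False,
    pref_lengths=[1, 3],
    suff_lengths=[2, 3],
)

features = extractor.predict([doc])
assert features[0].shape == (3, 4)  # 3 tokens, 2 prefix + 2 suffix hashes each
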
@@ -1,8 +1,6 @@
from pickle import EMPTY_DICT
import weakref

import numpy
from time import time
from numpy.testing import assert_array_equal
import pytest
import warnings
@@ -992,635 +990,3 @@ def test_doc_spans_setdefault(en_tokenizer):
    assert len(doc.spans["key2"]) == 1
    doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
    assert len(doc.spans["key3"]) == 2

EMPTY_HASH_VALUE = 0xCBF29CE484222325
|
||||
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
|
||||
FNV1A_PRIME = 0x00000100000001B3
|
||||
|
||||
|
||||
def _get_fnv1a_hash(input: bytes) -> int:
|
||||
hash_val = FNV1A_OFFSET_BASIS
|
||||
length = len(input)
|
||||
offset = 0
|
||||
|
||||
while offset < length:
|
||||
hash_val ^= input[offset]
|
||||
hash_val *= FNV1A_PRIME
|
||||
hash_val %= 2**64
|
||||
offset += 1
|
||||
return hash_val
|
||||
|
||||
|
||||
def test_fnv1a_hash():
|
||||
"""Checks the conformity of the 64-bit FNV1A implementation with
|
||||
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
|
||||
The method called here, _get_fnv1a_hash(), is only used in testing;
|
||||
in production code, the hashing is performed in a fashion that is interweaved
|
||||
with other logic. The conformity of the production code is demonstrated by the
|
||||
character combination hash tests, where hashes produced by the production code
|
||||
are tested for equality against hashes produced by _get_fnv1a_hash().
|
||||
"""
|
||||
INPUTS = [
|
||||
b"",
|
||||
b"a",
|
||||
b"b",
|
||||
b"c",
|
||||
b"d",
|
||||
b"e",
|
||||
b"f",
|
||||
b"fo",
|
||||
b"foo",
|
||||
b"foob",
|
||||
b"fooba",
|
||||
b"foobar",
|
||||
b"\x00",
|
||||
b"a\x00",
|
||||
b"b\x00",
|
||||
b"c\x00",
|
||||
b"d\x00",
|
||||
b"e\x00",
|
||||
b"f\x00",
|
||||
b"fo\x00",
|
||||
b"foo\x00",
|
||||
b"foob\x00",
|
||||
b"fooba\x00",
|
||||
b"foobar\x00",
|
||||
b"ch",
|
||||
b"cho",
|
||||
b"chon",
|
||||
b"chong",
|
||||
b"chongo",
|
||||
b"chongo ",
|
||||
b"chongo w",
|
||||
b"chongo wa",
|
||||
b"chongo was",
|
||||
b"chongo was ",
|
||||
b"chongo was h",
|
||||
b"chongo was he",
|
||||
b"chongo was her",
|
||||
b"chongo was here",
|
||||
b"chongo was here!",
|
||||
b"chongo was here!\n",
|
||||
b"ch\x00",
|
||||
b"cho\x00",
|
||||
b"chon\x00",
|
||||
b"chong\x00",
|
||||
b"chongo\x00",
|
||||
b"chongo \x00",
|
||||
b"chongo w\x00",
|
||||
b"chongo wa\x00",
|
||||
b"chongo was\x00",
|
||||
b"chongo was \x00",
|
||||
b"chongo was h\x00",
|
||||
b"chongo was he\x00",
|
||||
b"chongo was her\x00",
|
||||
b"chongo was here\x00",
|
||||
b"chongo was here!\x00",
|
||||
b"chongo was here!\n\x00",
|
||||
b"cu",
|
||||
b"cur",
|
||||
b"curd",
|
||||
b"curds",
|
||||
b"curds ",
|
||||
b"curds a",
|
||||
b"curds an",
|
||||
b"curds and",
|
||||
b"curds and ",
|
||||
b"curds and w",
|
||||
b"curds and wh",
|
||||
b"curds and whe",
|
||||
b"curds and whey",
|
||||
b"curds and whey\n",
|
||||
b"cu\x00",
|
||||
b"cur\x00",
|
||||
b"curd\x00",
|
||||
b"curds\x00",
|
||||
b"curds \x00",
|
||||
b"curds a\x00",
|
||||
b"curds an\x00",
|
||||
b"curds and\x00",
|
||||
b"curds and \x00",
|
||||
b"curds and w\x00",
|
||||
b"curds and wh\x00",
|
||||
b"curds and whe\x00",
|
||||
b"curds and whey\x00",
|
||||
b"curds and whey\n\x00",
|
||||
b"hi",
|
||||
b"hi\x00",
|
||||
b"hello",
|
||||
b"hello\x00",
|
||||
b"\xff\x00\x00\x01",
|
||||
b"\x01\x00\x00\xff",
|
||||
b"\xff\x00\x00\x02",
|
||||
b"\x02\x00\x00\xff",
|
||||
b"\xff\x00\x00\x03",
|
||||
b"\x03\x00\x00\xff",
|
||||
b"\xff\x00\x00\x04",
|
||||
b"\x04\x00\x00\xff",
|
||||
b"\x40\x51\x4e\x44",
|
||||
b"\x44\x4e\x51\x40",
|
||||
b"\x40\x51\x4e\x4a",
|
||||
b"\x4a\x4e\x51\x40",
|
||||
b"\x40\x51\x4e\x54",
|
||||
b"\x54\x4e\x51\x40",
|
||||
b"127.0.0.1",
|
||||
b"127.0.0.1\x00",
|
||||
b"127.0.0.2",
|
||||
b"127.0.0.2\x00",
|
||||
b"127.0.0.3",
|
||||
b"127.0.0.3\x00",
|
||||
b"64.81.78.68",
|
||||
b"64.81.78.68\x00",
|
||||
b"64.81.78.74",
|
||||
b"64.81.78.74\x00",
|
||||
b"64.81.78.84",
|
||||
b"64.81.78.84\x00",
|
||||
b"feedface",
|
||||
b"feedface\x00",
|
||||
b"feedfacedaffdeed",
|
||||
b"feedfacedaffdeed\x00",
|
||||
b"feedfacedeadbeef",
|
||||
b"feedfacedeadbeef\x00",
|
||||
b"line 1\nline 2\nline 3",
|
||||
b"chongo <Landon Curt Noll> /\\../\\",
|
||||
b"chongo <Landon Curt Noll> /\\../\\\x00",
|
||||
b"chongo (Landon Curt Noll) /\\../\\",
|
||||
b"chongo (Landon Curt Noll) /\\../\\\x00",
|
||||
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
|
||||
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
|
||||
b"http://epod.usra.edu/",
|
||||
b"http://exoplanet.eu/",
|
||||
b"http://hvo.wr.usgs.gov/cam3/",
|
||||
b"http://hvo.wr.usgs.gov/cams/HMcam/",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
|
||||
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
|
||||
b"http://neo.jpl.nasa.gov/risk/",
|
||||
b"http://norvig.com/21-days.html",
|
||||
b"http://primes.utm.edu/curios/home.php",
|
||||
b"http://slashdot.org/",
|
||||
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
|
||||
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
|
||||
b"http://www.avo.alaska.edu/activity/Redoubt.php",
|
||||
b"http://www.dilbert.com/fast/",
|
||||
b"http://www.fourmilab.ch/gravitation/orbits/",
|
||||
b"http://www.fpoa.net/",
|
||||
b"http://www.ioccc.org/index.html",
|
||||
b"http://www.isthe.com/cgi-bin/number.cgi",
|
||||
b"http://www.isthe.com/chongo/bio.html",
|
||||
b"http://www.isthe.com/chongo/index.html",
|
||||
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
|
||||
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
|
||||
b"http://www.isthe.com/chongo/tech/astro/vita.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/number/number.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
|
||||
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
|
||||
b"http://www.lavarnd.org/index.html",
|
||||
b"http://www.lavarnd.org/what/nist-test.html",
|
||||
b"http://www.macosxhints.com/",
|
||||
b"http://www.mellis.com/",
|
||||
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
|
||||
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
|
||||
b"http://www.paulnoll.com/",
|
||||
b"http://www.pepysdiary.com/",
|
||||
b"http://www.sciencenews.org/index/home/activity/view",
|
||||
b"http://www.skyandtelescope.com/",
|
||||
b"http://www.sput.nl/~rob/sirius.html",
|
||||
b"http://www.systemexperts.com/",
|
||||
b"http://www.tq-international.com/phpBB3/index.php",
|
||||
b"http://www.travelquesttours.com/index.htm",
|
||||
b"http://www.wunderground.com/global/stations/89606.html",
|
||||
b"21701" * 10,
|
||||
b"M21701" * 10,
|
||||
b"2^21701-1" * 10,
|
||||
b"\x54\xc5" * 10,
|
||||
b"\xc5\x54" * 10,
|
||||
b"23209" * 10,
|
||||
b"M23209" * 10,
|
||||
b"2^23209-1" * 10,
|
||||
b"\x5a\xa9" * 10,
|
||||
b"\xa9\x5a" * 10,
|
||||
b"391581216093" * 10,
|
||||
b"391581*2^216093-1" * 10,
|
||||
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
|
||||
b"FEDCBA9876543210" * 10,
|
||||
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
|
||||
b"EFCDAB8967452301" * 10,
|
||||
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
|
||||
b"0123456789ABCDEF" * 10,
|
||||
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
|
||||
b"1032547698BADCFE" * 10,
|
||||
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
|
||||
b"\x00" * 500,
|
||||
b"\x07" * 500,
|
||||
b"~" * 500,
|
||||
b"\x7f" * 500,
|
||||
]
|
||||
|
||||
OUTPUTS = [
|
||||
EMPTY_HASH_VALUE,
|
||||
0xAF63DC4C8601EC8C,
|
||||
0xAF63DF4C8601F1A5,
|
||||
0xAF63DE4C8601EFF2,
|
||||
0xAF63D94C8601E773,
|
||||
0xAF63D84C8601E5C0,
|
||||
0xAF63DB4C8601EAD9,
|
||||
0x08985907B541D342,
|
||||
0xDCB27518FED9D577,
|
||||
0xDD120E790C2512AF,
|
||||
0xCAC165AFA2FEF40A,
|
||||
0x85944171F73967E8,
|
||||
0xAF63BD4C8601B7DF,
|
||||
0x089BE207B544F1E4,
|
||||
0x08A61407B54D9B5F,
|
||||
0x08A2AE07B54AB836,
|
||||
0x0891B007B53C4869,
|
||||
0x088E4A07B5396540,
|
||||
0x08987C07B5420EBB,
|
||||
0xDCB28A18FED9F926,
|
||||
0xDD1270790C25B935,
|
||||
0xCAC146AFA2FEBF5D,
|
||||
0x8593D371F738ACFE,
|
||||
0x34531CA7168B8F38,
|
||||
0x08A25607B54A22AE,
|
||||
0xF5FAF0190CF90DF3,
|
||||
0xF27397910B3221C7,
|
||||
0x2C8C2B76062F22E0,
|
||||
0xE150688C8217B8FD,
|
||||
0xF35A83C10E4F1F87,
|
||||
0xD1EDD10B507344D0,
|
||||
0x2A5EE739B3DDB8C3,
|
||||
0xDCFB970CA1C0D310,
|
||||
0x4054DA76DAA6DA90,
|
||||
0xF70A2FF589861368,
|
||||
0x4C628B38AED25F17,
|
||||
0x9DD1F6510F78189F,
|
||||
0xA3DE85BD491270CE,
|
||||
0x858E2FA32A55E61D,
|
||||
0x46810940EFF5F915,
|
||||
0xF5FADD190CF8EDAA,
|
||||
0xF273ED910B32B3E9,
|
||||
0x2C8C5276062F6525,
|
||||
0xE150B98C821842A0,
|
||||
0xF35AA3C10E4F55E7,
|
||||
0xD1ED680B50729265,
|
||||
0x2A5F0639B3DDED70,
|
||||
0xDCFBAA0CA1C0F359,
|
||||
0x4054BA76DAA6A430,
|
||||
0xF709C7F5898562B0,
|
||||
0x4C62E638AED2F9B8,
|
||||
0x9DD1A8510F779415,
|
||||
0xA3DE2ABD4911D62D,
|
||||
0x858E0EA32A55AE0A,
|
||||
0x46810F40EFF60347,
|
||||
0xC33BCE57BEF63EAF,
|
||||
0x08A24307B54A0265,
|
||||
0xF5B9FD190CC18D15,
|
||||
0x4C968290ACE35703,
|
||||
0x07174BD5C64D9350,
|
||||
0x5A294C3FF5D18750,
|
||||
0x05B3C1AEB308B843,
|
||||
0xB92A48DA37D0F477,
|
||||
0x73CDDDCCD80EBC49,
|
||||
0xD58C4C13210A266B,
|
||||
0xE78B6081243EC194,
|
||||
0xB096F77096A39F34,
|
||||
0xB425C54FF807B6A3,
|
||||
0x23E520E2751BB46E,
|
||||
0x1A0B44CCFE1385EC,
|
||||
0xF5BA4B190CC2119F,
|
||||
0x4C962690ACE2BAAF,
|
||||
0x0716DED5C64CDA19,
|
||||
0x5A292C3FF5D150F0,
|
||||
0x05B3E0AEB308ECF0,
|
||||
0xB92A5EDA37D119D9,
|
||||
0x73CE41CCD80F6635,
|
||||
0xD58C2C132109F00B,
|
||||
0xE78BAF81243F47D1,
|
||||
0xB0968F7096A2EE7C,
|
||||
0xB425A84FF807855C,
|
||||
0x23E4E9E2751B56F9,
|
||||
0x1A0B4ECCFE1396EA,
|
||||
0x54ABD453BB2C9004,
|
||||
0x08BA5F07B55EC3DA,
|
||||
0x337354193006CB6E,
|
||||
0xA430D84680AABD0B,
|
||||
0xA9BC8ACCA21F39B1,
|
||||
0x6961196491CC682D,
|
||||
0xAD2BB1774799DFE9,
|
||||
0x6961166491CC6314,
|
||||
0x8D1BB3904A3B1236,
|
||||
0x6961176491CC64C7,
|
||||
0xED205D87F40434C7,
|
||||
0x6961146491CC5FAE,
|
||||
0xCD3BAF5E44F8AD9C,
|
||||
0xE3B36596127CD6D8,
|
||||
0xF77F1072C8E8A646,
|
||||
0xE3B36396127CD372,
|
||||
0x6067DCE9932AD458,
|
||||
0xE3B37596127CF208,
|
||||
0x4B7B10FA9FE83936,
|
||||
0xAABAFE7104D914BE,
|
||||
0xF4D3180B3CDE3EDA,
|
||||
0xAABAFD7104D9130B,
|
||||
0xF4CFB20B3CDB5BB1,
|
||||
0xAABAFC7104D91158,
|
||||
0xF4CC4C0B3CD87888,
|
||||
0xE729BAC5D2A8D3A7,
|
||||
0x74BC0524F4DFA4C5,
|
||||
0xE72630C5D2A5B352,
|
||||
0x6B983224EF8FB456,
|
||||
0xE73042C5D2AE266D,
|
||||
0x8527E324FDEB4B37,
|
||||
0x0A83C86FEE952ABC,
|
||||
0x7318523267779D74,
|
||||
0x3E66D3D56B8CACA1,
|
||||
0x956694A5C0095593,
|
||||
0xCAC54572BB1A6FC8,
|
||||
0xA7A4C9F3EDEBF0D8,
|
||||
0x7829851FAC17B143,
|
||||
0x2C8F4C9AF81BCF06,
|
||||
0xD34E31539740C732,
|
||||
0x3605A2AC253D2DB1,
|
||||
0x08C11B8346F4A3C3,
|
||||
0x6BE396289CE8A6DA,
|
||||
0xD9B957FB7FE794C5,
|
||||
0x05BE33DA04560A93,
|
||||
0x0957F1577BA9747C,
|
||||
0xDA2CC3ACC24FBA57,
|
||||
0x74136F185B29E7F0,
|
||||
0xB2F2B4590EDB93B2,
|
||||
0xB3608FCE8B86AE04,
|
||||
0x4A3A865079359063,
|
||||
0x5B3A7EF496880A50,
|
||||
0x48FAE3163854C23B,
|
||||
0x07AAA640476E0B9A,
|
||||
0x2F653656383A687D,
|
||||
0xA1031F8E7599D79C,
|
||||
0xA31908178FF92477,
|
||||
0x097EDF3C14C3FB83,
|
||||
0xB51CA83FEAA0971B,
|
||||
0xDD3C0D96D784F2E9,
|
||||
0x86CD26A9EA767D78,
|
||||
0xE6B215FF54A30C18,
|
||||
0xEC5B06A1C5531093,
|
||||
0x45665A929F9EC5E5,
|
||||
0x8C7609B4A9F10907,
|
||||
0x89AAC3A491F0D729,
|
||||
0x32CE6B26E0F4A403,
|
||||
0x614AB44E02B53E01,
|
||||
0xFA6472EB6EEF3290,
|
||||
0x9E5D75EB1948EB6A,
|
||||
0xB6D12AD4A8671852,
|
||||
0x88826F56EBA07AF1,
|
||||
0x44535BF2645BC0FD,
|
||||
0x169388FFC21E3728,
|
||||
0xF68AAC9E396D8224,
|
||||
0x8E87D7E7472B3883,
|
||||
0x295C26CAA8B423DE,
|
||||
0x322C814292E72176,
|
||||
0x8A06550EB8AF7268,
|
||||
0xEF86D60E661BCF71,
|
||||
0x9E5426C87F30EE54,
|
||||
0xF1EA8AA826FD047E,
|
||||
0x0BABAF9A642CB769,
|
||||
0x4B3341D4068D012E,
|
||||
0xD15605CBC30A335C,
|
||||
0x5B21060AED8412E5,
|
||||
0x45E2CDA1CE6F4227,
|
||||
0x50AE3745033AD7D4,
|
||||
0xAA4588CED46BF414,
|
||||
0xC1B0056C4A95467E,
|
||||
0x56576A71DE8B4089,
|
||||
0xBF20965FA6DC927E,
|
||||
0x569F8383C2040882,
|
||||
0xE1E772FBA08FECA0,
|
||||
0x4CED94AF97138AC4,
|
||||
0xC4112FFB337A82FB,
|
||||
0xD64A4FD41DE38B7D,
|
||||
0x4CFC32329EDEBCBB,
|
||||
0x0803564445050395,
|
||||
0xAA1574ECF4642FFD,
|
||||
0x694BC4E54CC315F9,
|
||||
0xA3D7CB273B011721,
|
||||
0x577C2F8B6115BFA5,
|
||||
0xB7EC8C1A769FB4C1,
|
||||
0x5D5CFCE63359AB19,
|
||||
0x33B96C3CD65B5F71,
|
||||
0xD845097780602BB9,
|
||||
0x84D47645D02DA3D5,
|
||||
0x83544F33B58773A5,
|
||||
0x9175CBB2160836C5,
|
||||
0xC71B3BC175E72BC5,
|
||||
0x636806AC222EC985,
|
||||
0xB6EF0E6950F52ED5,
|
||||
0xEAD3D8A0F3DFDAA5,
|
||||
0x922908FE9A861BA5,
|
||||
0x6D4821DE275FD5C5,
|
||||
0x1FE3FCE62BD816B5,
|
||||
0xC23E9FCCD6F70591,
|
||||
0xC1AF12BDFE16B5B5,
|
||||
0x39E9F18F2F85E221,
|
||||
]
|
||||
|
||||
assert len(INPUTS) == len(OUTPUTS)
|
||||
for i in range(len(INPUTS)):
|
||||
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
|
||||
|
||||
|
||||
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
|
||||
encoded_input = input.encode("UTF-8")
|
||||
if reverse:
|
||||
encoded_input = encoded_input[::-1]
|
||||
return _get_fnv1a_hash(encoded_input)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
|
||||
doc = en_tokenizer("spaCy✨ and Prodigy")
|
||||
|
||||
hashes = doc.get_character_combination_hashes(
|
||||
case_sensitive=case_sensitive,
|
||||
p_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
)
|
||||
),
|
||||
)
|
||||
assert hashes[0][0] == _encode_and_hash("s")
|
||||
assert hashes[0][1] == _encode_and_hash("spa")
|
||||
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
|
||||
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
|
||||
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
|
||||
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
|
||||
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
|
||||
assert hashes[1][0] == _encode_and_hash("✨")
|
||||
assert hashes[1][1] == _encode_and_hash("✨")
|
||||
assert hashes[1][2] == _encode_and_hash("✨")
|
||||
assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][4] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][5] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][6] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[2][0] == _encode_and_hash("a")
|
||||
assert hashes[2][1] == _encode_and_hash("and")
|
||||
assert hashes[2][2] == _encode_and_hash("and")
|
||||
assert hashes[2][3] == _encode_and_hash("dn")
|
||||
assert hashes[2][4] == _encode_and_hash("dna")
|
||||
assert hashes[2][5] == _encode_and_hash("dna")
|
||||
assert hashes[2][6] == _encode_and_hash("dna")
|
||||
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
|
||||
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
|
||||
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
|
||||
assert hashes[3][3] == _encode_and_hash("yg")
|
||||
assert hashes[3][4] == _encode_and_hash("ygi")
|
||||
assert hashes[3][5] == _encode_and_hash("ygid")
|
||||
assert hashes[3][6] == _encode_and_hash("ygido")
|
||||
|
||||
|
||||
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
|
||||
doc = en_tokenizer("spaCy✨ and Prodigy")
|
||||
hashes = doc.get_character_combination_hashes(
|
||||
case_sensitive=False,
|
||||
p_lengths=bytes(),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
assert hashes[0][0] == _encode_and_hash("yc")
|
||||
assert hashes[0][1] == _encode_and_hash("yca")
|
||||
assert hashes[0][2] == _encode_and_hash("ycap")
|
||||
assert hashes[0][3] == _encode_and_hash("ycaps")
|
||||
assert hashes[1][0] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][1] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][2] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[2][0] == _encode_and_hash("dn")
|
||||
assert hashes[2][1] == _encode_and_hash("dna")
|
||||
assert hashes[2][2] == _encode_and_hash("dna")
|
||||
assert hashes[2][3] == _encode_and_hash("dna")
|
||||
assert hashes[3][0] == _encode_and_hash("yg")
|
||||
assert hashes[3][1] == _encode_and_hash("ygi")
|
||||
assert hashes[3][2] == _encode_and_hash("ygid")
|
||||
assert hashes[3][3] == _encode_and_hash("ygido")
|
||||
|
||||
|
||||
def test_get_character_combination_hashes_various_lengths(en_tokenizer):
|
||||
doc = en_tokenizer("sp𐌞Cé")
|
||||
|
||||
for p_length in range(1, 8):
|
||||
for s_length in range(1, 8):
|
||||
|
||||
hashes = doc.get_character_combination_hashes(
|
||||
case_sensitive=False,
|
||||
p_lengths=bytes((p_length,)),
|
||||
s_lengths=bytes((s_length,)),
|
||||
)
|
||||
|
||||
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
|
||||
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||
def test_get_character_combination_hashes_turkish_i_with_dot(
|
||||
en_tokenizer, case_sensitive
|
||||
):
|
||||
doc = en_tokenizer("İ".lower() + "İ")
|
||||
hashes = doc.get_character_combination_hashes(
|
||||
case_sensitive=case_sensitive,
|
||||
p_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
|
||||
assert hashes[0][0] == _encode_and_hash("i")
|
||||
assert hashes[0][1] == _encode_and_hash("İ".lower())
|
||||
if case_sensitive:
|
||||
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
|
||||
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
|
||||
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
|
||||
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
|
||||
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
|
||||
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
|
||||
|
||||
else:
|
||||
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
|
||||
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
|
||||
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
|
||||
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
|
||||
assert hashes[0][6] == _encode_and_hash(
|
||||
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
|
||||
)
|
||||
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||
def test_get_character_combination_hashes_string_store_spec_cases(
|
||||
en_tokenizer, case_sensitive
|
||||
):
|
||||
symbol = "FLAG19"
|
||||
short_word = "bee"
|
||||
normal_word = "serendipity"
|
||||
long_word = "serendipity" * 50
|
||||
assert len(long_word) > 255
|
||||
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
|
||||
assert len(doc) == 4
|
||||
hashes = doc.get_character_combination_hashes(
|
||||
case_sensitive=case_sensitive,
|
||||
p_lengths=bytes((2,)),
|
||||
s_lengths=bytes((2,)),
|
||||
)
|
||||
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
|
||||
assert hashes[0][1] == _encode_and_hash("91")
|
||||
assert hashes[1][0] == _encode_and_hash("be")
|
||||
assert hashes[1][1] == _encode_and_hash("ee")
|
||||
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
|
||||
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
|
||||
|
||||
|
||||
def test_character_combination_hashes_empty_lengths(en_tokenizer):
|
||||
doc = en_tokenizer("and𐌞")
|
||||
assert doc.get_character_combination_hashes(
|
||||
case_sensitive=True,
|
||||
p_lengths=bytes(),
|
||||
s_lengths=bytes(),
|
||||
).shape == (1, 0)
|
||||
|

spacy/tests/doc/test_richfeatureextractor.py (new file, 639 lines)

@@ -0,0 +1,639 @@
import pytest
|
||||
from ...ml.richfeatureextractor import get_character_combination_hashes
|
||||
|
||||
EMPTY_HASH_VALUE = 0xCBF29CE484222325
|
||||
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
|
||||
FNV1A_PRIME = 0x00000100000001B3
|
||||
|
||||
|
||||
def _get_fnv1a_hash(input: bytes) -> int:
|
||||
hash_val = FNV1A_OFFSET_BASIS
|
||||
length = len(input)
|
||||
offset = 0
|
||||
|
||||
while offset < length:
|
||||
hash_val ^= input[offset]
|
||||
hash_val *= FNV1A_PRIME
|
||||
hash_val %= 2**64
|
||||
offset += 1
|
||||
return hash_val
|
||||
|
||||
|
||||
def test_fnv1a_hash():
|
||||
"""Checks the conformity of the 64-bit FNV1A implementation with
|
||||
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
|
||||
The method called here, _get_fnv1a_hash(), is only used in testing;
|
||||
in production code, the hashing is performed in a fashion that is interweaved
|
||||
with other logic. The conformity of the production code is demonstrated by the
|
||||
character combination hash tests, where hashes produced by the production code
|
||||
are tested for equality against hashes produced by _get_fnv1a_hash().
|
||||
"""
|
||||
INPUTS = [
|
||||
b"",
|
||||
b"a",
|
||||
b"b",
|
||||
b"c",
|
||||
b"d",
|
||||
b"e",
|
||||
b"f",
|
||||
b"fo",
|
||||
b"foo",
|
||||
b"foob",
|
||||
b"fooba",
|
||||
b"foobar",
|
||||
b"\x00",
|
||||
b"a\x00",
|
||||
b"b\x00",
|
||||
b"c\x00",
|
||||
b"d\x00",
|
||||
b"e\x00",
|
||||
b"f\x00",
|
||||
b"fo\x00",
|
||||
b"foo\x00",
|
||||
b"foob\x00",
|
||||
b"fooba\x00",
|
||||
b"foobar\x00",
|
||||
b"ch",
|
||||
b"cho",
|
||||
b"chon",
|
||||
b"chong",
|
||||
b"chongo",
|
||||
b"chongo ",
|
||||
b"chongo w",
|
||||
b"chongo wa",
|
||||
b"chongo was",
|
||||
b"chongo was ",
|
||||
b"chongo was h",
|
||||
b"chongo was he",
|
||||
b"chongo was her",
|
||||
b"chongo was here",
|
||||
b"chongo was here!",
|
||||
b"chongo was here!\n",
|
||||
b"ch\x00",
|
||||
b"cho\x00",
|
||||
b"chon\x00",
|
||||
b"chong\x00",
|
||||
b"chongo\x00",
|
||||
b"chongo \x00",
|
||||
b"chongo w\x00",
|
||||
b"chongo wa\x00",
|
||||
b"chongo was\x00",
|
||||
b"chongo was \x00",
|
||||
b"chongo was h\x00",
|
||||
b"chongo was he\x00",
|
||||
b"chongo was her\x00",
|
||||
b"chongo was here\x00",
|
||||
b"chongo was here!\x00",
|
||||
b"chongo was here!\n\x00",
|
||||
b"cu",
|
||||
b"cur",
|
||||
b"curd",
|
||||
b"curds",
|
||||
b"curds ",
|
||||
b"curds a",
|
||||
b"curds an",
|
||||
b"curds and",
|
||||
b"curds and ",
|
||||
b"curds and w",
|
||||
b"curds and wh",
|
||||
b"curds and whe",
|
||||
b"curds and whey",
|
||||
b"curds and whey\n",
|
||||
b"cu\x00",
|
||||
b"cur\x00",
|
||||
b"curd\x00",
|
||||
b"curds\x00",
|
||||
b"curds \x00",
|
||||
b"curds a\x00",
|
||||
b"curds an\x00",
|
||||
b"curds and\x00",
|
||||
b"curds and \x00",
|
||||
b"curds and w\x00",
|
||||
b"curds and wh\x00",
|
||||
b"curds and whe\x00",
|
||||
b"curds and whey\x00",
|
||||
b"curds and whey\n\x00",
|
||||
b"hi",
|
||||
b"hi\x00",
|
||||
b"hello",
|
||||
b"hello\x00",
|
||||
b"\xff\x00\x00\x01",
|
||||
b"\x01\x00\x00\xff",
|
||||
b"\xff\x00\x00\x02",
|
||||
b"\x02\x00\x00\xff",
|
||||
b"\xff\x00\x00\x03",
|
||||
b"\x03\x00\x00\xff",
|
||||
b"\xff\x00\x00\x04",
|
||||
b"\x04\x00\x00\xff",
|
||||
b"\x40\x51\x4e\x44",
|
||||
b"\x44\x4e\x51\x40",
|
||||
b"\x40\x51\x4e\x4a",
|
||||
b"\x4a\x4e\x51\x40",
|
||||
b"\x40\x51\x4e\x54",
|
||||
b"\x54\x4e\x51\x40",
|
||||
b"127.0.0.1",
|
||||
b"127.0.0.1\x00",
|
||||
b"127.0.0.2",
|
||||
b"127.0.0.2\x00",
|
||||
b"127.0.0.3",
|
||||
b"127.0.0.3\x00",
|
||||
b"64.81.78.68",
|
||||
b"64.81.78.68\x00",
|
||||
b"64.81.78.74",
|
||||
b"64.81.78.74\x00",
|
||||
b"64.81.78.84",
|
||||
b"64.81.78.84\x00",
|
||||
b"feedface",
|
||||
b"feedface\x00",
|
||||
b"feedfacedaffdeed",
|
||||
b"feedfacedaffdeed\x00",
|
||||
b"feedfacedeadbeef",
|
||||
b"feedfacedeadbeef\x00",
|
||||
b"line 1\nline 2\nline 3",
|
||||
b"chongo <Landon Curt Noll> /\\../\\",
|
||||
b"chongo <Landon Curt Noll> /\\../\\\x00",
|
||||
b"chongo (Landon Curt Noll) /\\../\\",
|
||||
b"chongo (Landon Curt Noll) /\\../\\\x00",
|
||||
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
|
||||
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
|
||||
b"http://epod.usra.edu/",
|
||||
b"http://exoplanet.eu/",
|
||||
b"http://hvo.wr.usgs.gov/cam3/",
|
||||
b"http://hvo.wr.usgs.gov/cams/HMcam/",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
|
||||
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
|
||||
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
|
||||
b"http://neo.jpl.nasa.gov/risk/",
|
||||
b"http://norvig.com/21-days.html",
|
||||
b"http://primes.utm.edu/curios/home.php",
|
||||
b"http://slashdot.org/",
|
||||
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
|
||||
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
|
||||
b"http://www.avo.alaska.edu/activity/Redoubt.php",
|
||||
b"http://www.dilbert.com/fast/",
|
||||
b"http://www.fourmilab.ch/gravitation/orbits/",
|
||||
b"http://www.fpoa.net/",
|
||||
b"http://www.ioccc.org/index.html",
|
||||
b"http://www.isthe.com/cgi-bin/number.cgi",
|
||||
b"http://www.isthe.com/chongo/bio.html",
|
||||
b"http://www.isthe.com/chongo/index.html",
|
||||
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
|
||||
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
|
||||
b"http://www.isthe.com/chongo/tech/astro/vita.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
|
||||
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/number/number.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
|
||||
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
|
||||
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
|
||||
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
|
||||
b"http://www.lavarnd.org/index.html",
|
||||
b"http://www.lavarnd.org/what/nist-test.html",
|
||||
b"http://www.macosxhints.com/",
|
||||
b"http://www.mellis.com/",
|
||||
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
|
||||
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
|
||||
b"http://www.paulnoll.com/",
|
||||
b"http://www.pepysdiary.com/",
|
||||
b"http://www.sciencenews.org/index/home/activity/view",
|
||||
b"http://www.skyandtelescope.com/",
|
||||
b"http://www.sput.nl/~rob/sirius.html",
|
||||
b"http://www.systemexperts.com/",
|
||||
b"http://www.tq-international.com/phpBB3/index.php",
|
||||
b"http://www.travelquesttours.com/index.htm",
|
||||
b"http://www.wunderground.com/global/stations/89606.html",
|
||||
b"21701" * 10,
|
||||
b"M21701" * 10,
|
||||
b"2^21701-1" * 10,
|
||||
b"\x54\xc5" * 10,
|
||||
b"\xc5\x54" * 10,
|
||||
b"23209" * 10,
|
||||
b"M23209" * 10,
|
||||
b"2^23209-1" * 10,
|
||||
b"\x5a\xa9" * 10,
|
||||
b"\xa9\x5a" * 10,
|
||||
b"391581216093" * 10,
|
||||
b"391581*2^216093-1" * 10,
|
||||
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
|
||||
b"FEDCBA9876543210" * 10,
|
||||
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
|
||||
b"EFCDAB8967452301" * 10,
|
||||
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
|
||||
b"0123456789ABCDEF" * 10,
|
||||
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
|
||||
b"1032547698BADCFE" * 10,
|
||||
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
|
||||
b"\x00" * 500,
|
||||
b"\x07" * 500,
|
||||
b"~" * 500,
|
||||
b"\x7f" * 500,
|
||||
]
|
||||
|
||||
OUTPUTS = [
|
||||
EMPTY_HASH_VALUE,
|
||||
0xAF63DC4C8601EC8C,
|
||||
0xAF63DF4C8601F1A5,
|
||||
0xAF63DE4C8601EFF2,
|
||||
0xAF63D94C8601E773,
|
||||
0xAF63D84C8601E5C0,
|
||||
0xAF63DB4C8601EAD9,
|
||||
0x08985907B541D342,
|
||||
0xDCB27518FED9D577,
|
||||
0xDD120E790C2512AF,
|
||||
0xCAC165AFA2FEF40A,
|
||||
0x85944171F73967E8,
|
||||
0xAF63BD4C8601B7DF,
|
||||
0x089BE207B544F1E4,
|
||||
0x08A61407B54D9B5F,
|
||||
0x08A2AE07B54AB836,
|
||||
0x0891B007B53C4869,
|
||||
0x088E4A07B5396540,
|
||||
0x08987C07B5420EBB,
|
||||
0xDCB28A18FED9F926,
|
||||
0xDD1270790C25B935,
|
||||
0xCAC146AFA2FEBF5D,
|
||||
0x8593D371F738ACFE,
|
||||
0x34531CA7168B8F38,
|
||||
0x08A25607B54A22AE,
|
||||
0xF5FAF0190CF90DF3,
|
||||
0xF27397910B3221C7,
|
||||
0x2C8C2B76062F22E0,
|
||||
0xE150688C8217B8FD,
|
||||
0xF35A83C10E4F1F87,
|
||||
0xD1EDD10B507344D0,
|
||||
0x2A5EE739B3DDB8C3,
|
||||
0xDCFB970CA1C0D310,
|
||||
0x4054DA76DAA6DA90,
|
||||
0xF70A2FF589861368,
|
||||
0x4C628B38AED25F17,
|
||||
0x9DD1F6510F78189F,
|
||||
0xA3DE85BD491270CE,
|
||||
0x858E2FA32A55E61D,
|
||||
0x46810940EFF5F915,
|
||||
0xF5FADD190CF8EDAA,
|
||||
0xF273ED910B32B3E9,
|
||||
0x2C8C5276062F6525,
|
||||
0xE150B98C821842A0,
|
||||
0xF35AA3C10E4F55E7,
|
||||
0xD1ED680B50729265,
|
||||
0x2A5F0639B3DDED70,
|
||||
0xDCFBAA0CA1C0F359,
|
||||
0x4054BA76DAA6A430,
|
||||
0xF709C7F5898562B0,
|
||||
0x4C62E638AED2F9B8,
|
||||
0x9DD1A8510F779415,
|
||||
0xA3DE2ABD4911D62D,
|
||||
0x858E0EA32A55AE0A,
|
||||
0x46810F40EFF60347,
|
||||
0xC33BCE57BEF63EAF,
|
||||
0x08A24307B54A0265,
|
||||
0xF5B9FD190CC18D15,
|
||||
0x4C968290ACE35703,
|
||||
0x07174BD5C64D9350,
|
||||
0x5A294C3FF5D18750,
|
||||
0x05B3C1AEB308B843,
|
||||
0xB92A48DA37D0F477,
|
||||
0x73CDDDCCD80EBC49,
|
||||
0xD58C4C13210A266B,
|
||||
0xE78B6081243EC194,
|
||||
0xB096F77096A39F34,
|
||||
0xB425C54FF807B6A3,
|
||||
0x23E520E2751BB46E,
|
||||
0x1A0B44CCFE1385EC,
|
||||
0xF5BA4B190CC2119F,
|
||||
0x4C962690ACE2BAAF,
|
||||
0x0716DED5C64CDA19,
|
||||
0x5A292C3FF5D150F0,
|
||||
0x05B3E0AEB308ECF0,
|
||||
0xB92A5EDA37D119D9,
|
||||
0x73CE41CCD80F6635,
|
||||
0xD58C2C132109F00B,
|
||||
0xE78BAF81243F47D1,
|
||||
0xB0968F7096A2EE7C,
|
||||
0xB425A84FF807855C,
|
||||
0x23E4E9E2751B56F9,
|
||||
0x1A0B4ECCFE1396EA,
|
||||
0x54ABD453BB2C9004,
|
||||
0x08BA5F07B55EC3DA,
|
||||
0x337354193006CB6E,
|
||||
0xA430D84680AABD0B,
|
||||
0xA9BC8ACCA21F39B1,
|
||||
0x6961196491CC682D,
|
||||
0xAD2BB1774799DFE9,
|
||||
0x6961166491CC6314,
|
||||
0x8D1BB3904A3B1236,
|
||||
0x6961176491CC64C7,
|
||||
0xED205D87F40434C7,
|
||||
0x6961146491CC5FAE,
|
||||
0xCD3BAF5E44F8AD9C,
|
||||
0xE3B36596127CD6D8,
|
||||
0xF77F1072C8E8A646,
|
||||
0xE3B36396127CD372,
|
||||
0x6067DCE9932AD458,
|
||||
0xE3B37596127CF208,
|
||||
0x4B7B10FA9FE83936,
|
||||
0xAABAFE7104D914BE,
|
||||
0xF4D3180B3CDE3EDA,
|
||||
0xAABAFD7104D9130B,
|
||||
0xF4CFB20B3CDB5BB1,
|
||||
0xAABAFC7104D91158,
|
||||
0xF4CC4C0B3CD87888,
|
||||
0xE729BAC5D2A8D3A7,
|
||||
0x74BC0524F4DFA4C5,
|
||||
0xE72630C5D2A5B352,
|
||||
0x6B983224EF8FB456,
|
||||
0xE73042C5D2AE266D,
|
||||
0x8527E324FDEB4B37,
|
||||
0x0A83C86FEE952ABC,
|
||||
0x7318523267779D74,
|
||||
0x3E66D3D56B8CACA1,
|
||||
0x956694A5C0095593,
|
||||
0xCAC54572BB1A6FC8,
|
||||
0xA7A4C9F3EDEBF0D8,
|
||||
0x7829851FAC17B143,
|
||||
0x2C8F4C9AF81BCF06,
|
||||
0xD34E31539740C732,
|
||||
0x3605A2AC253D2DB1,
|
||||
0x08C11B8346F4A3C3,
|
||||
0x6BE396289CE8A6DA,
|
||||
0xD9B957FB7FE794C5,
|
||||
0x05BE33DA04560A93,
|
||||
0x0957F1577BA9747C,
|
||||
0xDA2CC3ACC24FBA57,
|
||||
0x74136F185B29E7F0,
|
||||
0xB2F2B4590EDB93B2,
|
||||
0xB3608FCE8B86AE04,
|
||||
0x4A3A865079359063,
|
||||
0x5B3A7EF496880A50,
|
||||
0x48FAE3163854C23B,
|
||||
0x07AAA640476E0B9A,
|
||||
0x2F653656383A687D,
|
||||
0xA1031F8E7599D79C,
|
||||
0xA31908178FF92477,
|
||||
0x097EDF3C14C3FB83,
|
||||
0xB51CA83FEAA0971B,
|
||||
0xDD3C0D96D784F2E9,
|
||||
0x86CD26A9EA767D78,
|
||||
0xE6B215FF54A30C18,
|
||||
0xEC5B06A1C5531093,
|
||||
0x45665A929F9EC5E5,
|
||||
0x8C7609B4A9F10907,
|
||||
0x89AAC3A491F0D729,
|
||||
0x32CE6B26E0F4A403,
|
||||
0x614AB44E02B53E01,
|
||||
0xFA6472EB6EEF3290,
|
||||
0x9E5D75EB1948EB6A,
|
||||
0xB6D12AD4A8671852,
|
||||
0x88826F56EBA07AF1,
|
||||
0x44535BF2645BC0FD,
|
||||
0x169388FFC21E3728,
|
||||
0xF68AAC9E396D8224,
|
||||
0x8E87D7E7472B3883,
|
||||
0x295C26CAA8B423DE,
|
||||
0x322C814292E72176,
|
||||
0x8A06550EB8AF7268,
|
||||
0xEF86D60E661BCF71,
|
||||
0x9E5426C87F30EE54,
|
||||
0xF1EA8AA826FD047E,
|
||||
0x0BABAF9A642CB769,
|
||||
0x4B3341D4068D012E,
|
||||
0xD15605CBC30A335C,
|
||||
0x5B21060AED8412E5,
|
||||
0x45E2CDA1CE6F4227,
|
||||
0x50AE3745033AD7D4,
|
||||
0xAA4588CED46BF414,
|
||||
0xC1B0056C4A95467E,
|
||||
0x56576A71DE8B4089,
|
||||
0xBF20965FA6DC927E,
|
||||
0x569F8383C2040882,
|
||||
0xE1E772FBA08FECA0,
|
||||
0x4CED94AF97138AC4,
|
||||
0xC4112FFB337A82FB,
|
||||
0xD64A4FD41DE38B7D,
|
||||
0x4CFC32329EDEBCBB,
|
||||
0x0803564445050395,
|
||||
0xAA1574ECF4642FFD,
|
||||
0x694BC4E54CC315F9,
|
||||
0xA3D7CB273B011721,
|
||||
0x577C2F8B6115BFA5,
|
||||
0xB7EC8C1A769FB4C1,
|
||||
0x5D5CFCE63359AB19,
|
||||
0x33B96C3CD65B5F71,
|
||||
0xD845097780602BB9,
|
||||
0x84D47645D02DA3D5,
|
||||
0x83544F33B58773A5,
|
||||
0x9175CBB2160836C5,
|
||||
0xC71B3BC175E72BC5,
|
||||
0x636806AC222EC985,
|
||||
0xB6EF0E6950F52ED5,
|
||||
0xEAD3D8A0F3DFDAA5,
|
||||
0x922908FE9A861BA5,
|
||||
0x6D4821DE275FD5C5,
|
||||
0x1FE3FCE62BD816B5,
|
||||
0xC23E9FCCD6F70591,
|
||||
0xC1AF12BDFE16B5B5,
|
||||
0x39E9F18F2F85E221,
|
||||
]
|
||||
|
||||
assert len(INPUTS) == len(OUTPUTS)
|
||||
for i in range(len(INPUTS)):
|
||||
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
|
||||
|
||||
|
||||
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
|
||||
encoded_input = input.encode("UTF-8")
|
||||
if reverse:
|
||||
encoded_input = encoded_input[::-1]
|
||||
return _get_fnv1a_hash(encoded_input)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
|
||||
doc = en_tokenizer("spaCy✨ and Prodigy")
|
||||
|
||||
hashes = get_character_combination_hashes(
|
||||
doc=doc,
|
||||
case_sensitive=case_sensitive,
|
||||
p_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
)
|
||||
),
|
||||
)
|
||||
assert hashes[0][0] == _encode_and_hash("s")
|
||||
assert hashes[0][1] == _encode_and_hash("spa")
|
||||
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
|
||||
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
|
||||
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
|
||||
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
|
||||
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
|
||||
assert hashes[1][0] == _encode_and_hash("✨")
|
||||
assert hashes[1][1] == _encode_and_hash("✨")
|
||||
assert hashes[1][2] == _encode_and_hash("✨")
|
||||
assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][4] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][5] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][6] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[2][0] == _encode_and_hash("a")
|
||||
assert hashes[2][1] == _encode_and_hash("and")
|
||||
assert hashes[2][2] == _encode_and_hash("and")
|
||||
assert hashes[2][3] == _encode_and_hash("dn")
|
||||
assert hashes[2][4] == _encode_and_hash("dna")
|
||||
assert hashes[2][5] == _encode_and_hash("dna")
|
||||
assert hashes[2][6] == _encode_and_hash("dna")
|
||||
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
|
||||
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
|
||||
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
|
||||
assert hashes[3][3] == _encode_and_hash("yg")
|
||||
assert hashes[3][4] == _encode_and_hash("ygi")
|
||||
assert hashes[3][5] == _encode_and_hash("ygid")
|
||||
assert hashes[3][6] == _encode_and_hash("ygido")
|
||||
|
||||
|
||||
def test_get_character_combination_hashes_good_case_no_prefixes(en_tokenizer):
|
||||
doc = en_tokenizer("spaCy✨ and Prodigy")
|
||||
hashes = get_character_combination_hashes(
|
||||
doc=doc,
|
||||
case_sensitive=False,
|
||||
p_lengths=bytes(),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
assert hashes[0][0] == _encode_and_hash("yc")
|
||||
assert hashes[0][1] == _encode_and_hash("yca")
|
||||
assert hashes[0][2] == _encode_and_hash("ycap")
|
||||
assert hashes[0][3] == _encode_and_hash("ycaps")
|
||||
assert hashes[1][0] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][1] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][2] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[1][3] == _encode_and_hash("✨", reverse=True)
|
||||
assert hashes[2][0] == _encode_and_hash("dn")
|
||||
assert hashes[2][1] == _encode_and_hash("dna")
|
||||
assert hashes[2][2] == _encode_and_hash("dna")
|
||||
assert hashes[2][3] == _encode_and_hash("dna")
|
||||
assert hashes[3][0] == _encode_and_hash("yg")
|
||||
assert hashes[3][1] == _encode_and_hash("ygi")
|
||||
assert hashes[3][2] == _encode_and_hash("ygid")
|
||||
assert hashes[3][3] == _encode_and_hash("ygido")
|
||||
|
||||
|
||||
def test_get_character_combination_hashes_loop_through_lengths(en_tokenizer):
|
||||
doc = en_tokenizer("sp𐌞Cé")
|
||||
|
||||
for p_length in range(1, 8):
|
||||
for s_length in range(1, 8):
|
||||
|
||||
hashes = get_character_combination_hashes(
|
||||
doc=doc,
|
||||
case_sensitive=False,
|
||||
p_lengths=bytes((p_length,)),
|
||||
s_lengths=bytes((s_length,)),
|
||||
)
|
||||
|
||||
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
|
||||
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
|
||||
def test_get_character_combination_hashes_turkish_i_with_dot(
|
||||
en_tokenizer, case_sensitive
|
||||
):
|
||||
doc = en_tokenizer("İ".lower() + "İ")
|
||||
hashes = get_character_combination_hashes(
|
||||
doc=doc,
|
||||
case_sensitive=case_sensitive,
|
||||
p_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
s_lengths=bytes(
|
||||
(
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
|
||||
assert hashes[0][0] == _encode_and_hash("i")
|
||||
assert hashes[0][1] == _encode_and_hash("İ".lower())
|
||||
if case_sensitive:
|
||||
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
|
||||
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
|
||||
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
|
||||
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
|
||||
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
|
||||
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
|
||||
|
||||
else:
|
||||
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
|
||||
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
|
||||
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
|
||||
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
|
||||
assert hashes[0][6] == _encode_and_hash(
|
||||
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
|
||||
)
|
||||
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_string_store_spec_cases(
    en_tokenizer, case_sensitive
):
    symbol = "FLAG19"
    short_word = "bee"
    normal_word = "serendipity"
    long_word = "serendipity" * 50
    assert len(long_word) > 255
    doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
    assert len(doc) == 4
    hashes = get_character_combination_hashes(
        doc=doc,
        case_sensitive=case_sensitive,
        p_lengths=bytes((2,)),
        s_lengths=bytes((2,)),
    )
    assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
    assert hashes[0][1] == _encode_and_hash("91")
    assert hashes[1][0] == _encode_and_hash("be")
    assert hashes[1][1] == _encode_and_hash("ee")
    assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
    assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")


def test_character_combination_hashes_empty_lengths(en_tokenizer):
    doc = en_tokenizer("and𐌞")
    assert get_character_combination_hashes(
        doc=doc,
        case_sensitive=True,
        p_lengths=bytes(),
        s_lengths=bytes(),
    ).shape == (1, 0)

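Taken together, these tests pin down the output contract: one row per token, one column per requested affix length (prefix columns first, then suffix columns), returned as 64-bit unsigned hashes. A minimal sketch in the same style as the tests above; the test name and the lengths are illustrative only and do not come from the diff:

    def test_character_combination_hashes_shape_sketch(en_tokenizer):
        doc = en_tokenizer("spaCy does NLP")
        hashes = get_character_combination_hashes(
            doc=doc,
            case_sensitive=False,
            p_lengths=bytes((1, 3)),  # two prefix columns
            s_lengths=bytes((2,)),    # one suffix column
        )
        assert hashes.shape == (len(doc), 3)  # (tokens, prefix columns + suffix columns)
        assert hashes.dtype == "uint64"
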
@ -10,6 +10,7 @@ from ..attrs cimport attr_id_t

cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil


ctypedef const LexemeC* const_Lexeme_ptr
ctypedef const TokenC* const_TokenC_ptr


@ -18,7 +19,6 @@ ctypedef fused LexemeOrToken:
    const_TokenC_ptr


cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1


@ -34,31 +34,6 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)

cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int p_max_l,
    unsigned char* pref_l_buf,
) nogil


cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int s_max_l,
    unsigned char* suff_l_buf,
) nogil


cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int res_buf_last,
    np.uint64_t* hashes_ptr,
) nogil


cdef class Doc:
    cdef readonly Pool mem
    cdef readonly Vocab vocab

@ -126,7 +126,7 @@ class Doc:
        blocked: Optional[List[Span]] = ...,
        missing: Optional[List[Span]] = ...,
        outside: Optional[List[Span]] = ...,
        default: str = ...,
        default: str = ...
    ) -> None: ...
    @property
    def noun_chunks(self) -> Iterator[Span]: ...

@ -174,12 +174,5 @@ class Doc:
        self, doc_json: Dict[str, Any] = ..., validate: bool = False
    ) -> Doc: ...
    def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
    def get_character_combination_hashes(
        self,
        *,
        case_sensitive: bool,
        p_lengths: bytes,
        s_lengths: bytes,
    ) -> Ints2d: ...
    @staticmethod
    def _get_array_attrs() -> Tuple[Any]: ...

@ -1,9 +1,9 @@
# cython: infer_types=True, bounds_check=False, profile=True
from typing import Set, List
from typing import Set

cimport cython
cimport numpy as np
from libc.string cimport memcpy, memcmp, memset, strlen
from libc.string cimport memcpy
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t

@ -21,7 +21,6 @@ from .span cimport Span
from .token cimport MISSING_DEP
from ._dict_proxies import SpanGroups
from .token cimport Token
from ..symbols import NAMES as SYMBOLS_BY_INT
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t

@ -41,7 +40,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
from ..util import get_words_and_spaces

DEF PADDING = 5
MAX_UTF8_CHAR_BYTE_WIDTH = 4


cdef int bounds_check(int i, int length, int padding) except -1:
    if (i + padding) < 0:

@ -1746,77 +1745,6 @@ cdef class Doc:
                j += 1
        return output

    def get_character_combination_hashes(
        self,
        *,
        const bint case_sensitive,
        const unsigned char* p_lengths,
        const unsigned char* s_lengths,
    ):
        """
        Returns a 2D NumPy array in which the rows represent tokens and the columns represent hashes of various
        character combinations derived from the raw text of each token.

        case_sensitive: if *False*, hashes are generated from the lower-case version of each token.
        p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed, in ascending order.
            For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
        s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed, in ascending order.
            For example, if *s_lengths==[2, 3]* and *case_sensitive == True*, the suffixes hashed for "spaCy" would be
            "yC" and "yCa".

        Many of the buffers passed into and used by this method contain single-byte numerical values. This takes
        advantage of the fact that we are hashing short affixes and searching for small groups of characters. The
        calling code is responsible for ensuring that the lengths passed in cannot exceed 63 and hence, with maximally
        four-byte character widths, that individual values within the buffers can never exceed the capacity of a
        single byte (255).

        Note that this method performs no data validation itself: it expects the calling code to have done so already,
        and its behaviour may be erratic if the supplied parameters do not conform to these expectations.
        """
        # Work out lengths
        cdef int p_lengths_l = strlen(<char*> p_lengths)
        cdef int s_lengths_l = strlen(<char*> s_lengths)
        cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
        cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0

        # Define / allocate buffers
        cdef Pool mem = Pool()
        cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
        cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
        cdef int doc_l = self.length
        cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
            (doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
        cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data

        # Define working variables
        cdef TokenC tok_c
        cdef int tok_i, tok_str_l
        cdef attr_t num_tok_attr
        cdef bytes tok_str_bytes
        cdef const unsigned char* tok_str

        for tok_i in range(doc_l):
            tok_c = self.c[tok_i]
            num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
            if num_tok_attr < len(SYMBOLS_BY_INT):  # hardly ever happens
                if num_tok_attr == 0:
                    tok_str_bytes = b""
                else:
                    tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
                tok_str = tok_str_bytes
                tok_str_l = len(tok_str_bytes)
            else:
                tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)

            if p_max_l > 0:
                _set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
                hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)

            if s_max_l > 0:
                _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
                hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)

        return hashes

    @staticmethod
    def _get_array_attrs():
        attrs = [LENGTH, SPACY]

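To make the column layout of the removed method concrete: at the character level, the affixes hashed into one row are the requested prefixes followed by the requested suffixes read backwards from the end of the token (the implementation itself works on UTF-8 bytes, which is why the multi-byte tests above look different). A hypothetical helper illustrating this with the docstring's "spaCy" example:

    def affix_strings(token, p_lengths, s_lengths, case_sensitive=False):
        # Illustration only: the affix substrings whose hashes fill one row, in column order.
        text = token if case_sensitive else token.lower()
        prefixes = [text[:n] for n in p_lengths]
        suffixes = [text[:-n - 1:-1] for n in s_lengths]  # read backwards from the end
        return prefixes + suffixes

    assert affix_strings("spaCy", bytes((2, 3)), bytes((2, 3))) == ["sp", "spa", "yc", "yca"]
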
@ -1998,113 +1926,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
    return lca_matrix


cdef void _set_prefix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int p_max_l,
    unsigned char* pref_l_buf,
) nogil:
    """Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by
    the first *p_max_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word
    are populated with the byte length of the whole word.

    tok_str: a UTF-8 representation of a string.
    tok_str_l: the length of *tok_str*.
    p_max_l: the number of characters to process at the beginning of the word.
    pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling
        *get_character_combination_hashes()* is responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with
        maximally four-byte character widths, that individual values within the buffer can never exceed the capacity of
        a single byte (255).
    """
    cdef int tok_str_idx = 1, pref_l_buf_idx = 0

    while pref_l_buf_idx < p_max_l:
        if (
            tok_str_idx >= tok_str_l
            or (tok_str[tok_str_idx] & 0xc0) != 0x80  # not a continuation byte
        ):
            pref_l_buf[pref_l_buf_idx] = tok_str_idx
            pref_l_buf_idx += 1
        if tok_str_idx >= tok_str_l:
            break
        tok_str_idx += 1

    if pref_l_buf_idx < p_max_l:
        memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)

cdef void _set_suffix_lengths(
    const unsigned char* tok_str,
    const int tok_str_l,
    const int s_max_l,
    unsigned char* suff_l_buf,
) nogil:
    """Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the
    last *s_max_l* characters within *tok_str*. Lengths that are greater than the character length of the whole word are
    populated with the byte length of the whole word.

    tok_str: a UTF-8 representation of a string.
    tok_str_l: the length of *tok_str*.
    s_max_l: the number of characters to process at the end of the word.
    suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling
        *get_character_combination_hashes()* is responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with
        maximally four-byte character widths, that individual values within the buffer can never exceed the capacity of
        a single byte (255).
    """
    cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0

    while suff_l_buf_idx < s_max_l:
        if (tok_str[tok_str_idx] & 0xc0) != 0x80:  # not a continuation byte
            suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
            suff_l_buf_idx += 1
        tok_str_idx -= 1
        if tok_str_idx < 0:
            break

    if suff_l_buf_idx < s_max_l:
        memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)

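For intuition, a rough pure-Python equivalent of what these two helpers compute: for a UTF-8 encoded token, the cumulative byte lengths of its first and last *n* characters, padded with the whole-word byte length when the token is shorter than *n*. This is a readability sketch with hypothetical function names, not the nogil implementation:

    def prefix_byte_lengths(token, p_max_l):
        # Byte length of the first 1, 2, ..., p_max_l characters, padded with the full length.
        lengths = [len(token[:i].encode("utf8")) for i in range(1, min(p_max_l, len(token)) + 1)]
        return lengths + [len(token.encode("utf8"))] * (p_max_l - len(lengths))

    def suffix_byte_lengths(token, s_max_l):
        # Byte length of the last 1, 2, ..., s_max_l characters, padded with the full length.
        lengths = [len(token[-i:].encode("utf8")) for i in range(1, min(s_max_l, len(token)) + 1)]
        return lengths + [len(token.encode("utf8"))] * (s_max_l - len(lengths))

    assert prefix_byte_lengths("spaCy", 3) == [1, 2, 3]
    assert prefix_byte_lengths("s𐌞", 4) == [1, 5, 5, 5]  # "𐌞" is four bytes in UTF-8
    assert suffix_byte_lengths("s𐌞", 3) == [4, 5, 5]
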
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3


cdef int _write_hashes(
    const unsigned char* res_buf,
    const unsigned char* aff_l_buf,
    const unsigned char* offset_buf,
    const int res_buf_last,
    np.uint64_t* hashes_ptr,
) nogil:
    """Write 64-bit FNV1A hashes for a token/rich property group combination.

    res_buf: the string from which to generate the hash values.
    aff_l_buf: one-byte lengths specifying how many characters to hash.
    offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
    res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in *res_buf*; if affixes
        should start at the beginning of *res_buf*, *0*.
    hashes_ptr: a pointer starting from which the new hashes should be written.

    Returns: the number of hashes written.
    """
    cdef int last_offset = 0, hash_idx = 0, offset, aff_l
    cdef uint64_t hash_val = FNV1A_OFFSET_BASIS

    while True:
        aff_l = aff_l_buf[hash_idx]
        if aff_l == 0:
            return hash_idx
        offset = offset_buf[aff_l - 1]
        while last_offset < offset:
            if res_buf_last > 0:
                hash_val ^= res_buf[res_buf_last - last_offset]
            else:
                hash_val ^= res_buf[last_offset]
            hash_val *= FNV1A_PRIME
            last_offset += 1
        hashes_ptr[hash_idx] = hash_val
        hash_idx += 1

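The constants above are the standard 64-bit FNV-1a offset basis and prime, and *_write_hashes* applies the usual xor-then-multiply step once per byte, reading bytes backwards from *res_buf_last* for suffixes. A pure-Python reference of the per-affix hash, which is presumably also what the tests' *_encode_and_hash* helper mirrors (an assumption; that helper is not shown in this diff):

    FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
    FNV1A_PRIME = 0x00000100000001B3

    def fnv1a_64(data: bytes) -> int:
        # Standard 64-bit FNV-1a: xor each byte into the hash, then multiply by the prime.
        h = FNV1A_OFFSET_BASIS
        for b in data:
            h ^= b
            h = (h * FNV1A_PRIME) & 0xFFFFFFFFFFFFFFFF  # keep the value to 64 bits
        return h

    # Prefix hash of "sp" and suffix hash of "yc" (bytes read from the end) for "spacy":
    prefix_hash = fnv1a_64("spacy".encode("utf8")[:2])
    suffix_hash = fnv1a_64("spacy".encode("utf8")[::-1][:2])
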
def pickle_doc(doc):
    bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
    hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,