Move functionality to richfeatureextractor

richard@explosion.ai 2023-02-01 18:04:06 +01:00
parent 5a08596f92
commit 5d24934cf5
12 changed files with 920 additions and 897 deletions

View File

@ -34,6 +34,7 @@ MOD_NAMES = [
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.ml.richfeatureextractor",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",

View File

@ -973,6 +973,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1051 = ("Invalid rich group config '{label}'.")
E1052 = ("Length > 63 in rich group config '{label}'.")
E1053 = ("Rich group config {label} specifies lengths that are not in ascending order.")
E1054 = ("Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs.")
# Deprecated model shortcuts, only used in errors and warnings
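For illustration only (not part of the commit): the new E1054 template renders roughly as follows once formatted, with made-up counts; spaCy prefixes the error code in square brackets.

    template = "Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs."
    print("[E1054] " + template.format(len_rows=3, len_attrs=2))
    # [E1054] Mismatched lengths in hash embed config: 3 rows, 2 attrs.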

View File

@ -151,7 +151,7 @@ def MultiHashEmbed(
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))
seed = 7
def make_hash_embed(index):
@ -253,7 +253,7 @@ def RichMultiHashEmbed(
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs)))
_verify_rich_config_group("prefix", pref_lengths, pref_rows)
_verify_rich_config_group("suffix", suff_lengths, suff_rows)
@ -296,7 +296,7 @@ def RichMultiHashEmbed(
),
max_out,
ragged2list(),
Dropout(0.0)
Dropout(0.0),
)
else:
model = chain(
@ -305,7 +305,7 @@ def RichMultiHashEmbed(
with_array(concatenate(*embeddings)),
max_out,
ragged2list(),
Dropout(0.0)
Dropout(0.0),
)
return model
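The body of _verify_rich_config_group is not shown in this diff. For illustration only, the following hypothetical sketch reconstructs the checks it presumably performs, inferred from error codes E1051-E1053 above; the real implementation may differ.

    from typing import List, Optional

    def _verify_rich_config_group_sketch(
        label: str, lengths: Optional[List[int]], rows: Optional[List[int]]
    ) -> None:
        # Hypothetical condition for E1051: lengths and rows must be supplied
        # together and have matching lengths.
        if (lengths is None) != (rows is None) or (
            lengths is not None and rows is not None and len(lengths) != len(rows)
        ):
            raise ValueError(f"Invalid rich group config '{label}'.")
        if lengths:
            # E1052: affix lengths are stored in single bytes downstream, hence the cap.
            if max(lengths) > 63:
                raise ValueError(f"Length > 63 in rich group config '{label}'.")
            # E1053: lengths must be in ascending order.
            if any(a >= b for a, b in zip(lengths, lengths[1:])):
                raise ValueError(
                    f"Rich group config {label} specifies lengths that are not in ascending order."
                )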

View File

@ -0,0 +1,27 @@
cimport numpy as np
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil

View File

@ -1,42 +0,0 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc
@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
},
)
def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
p_lengths: bytes = model.attrs["p_lengths"]
s_lengths: bytes = model.attrs["s_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
)
features.append(ops.asarray2i(hashes, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

View File

@ -0,0 +1,9 @@
from ..tokens import Doc
def get_character_combination_hashes(
*,
doc: Doc,
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
):

View File

@ -0,0 +1,233 @@
from typing import List, Optional, Callable, Tuple
import numpy
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..symbols import NAMES as SYMBOLS_BY_INT
cimport numpy as np
from cymem.cymem cimport Pool
from libc.string cimport memset, strlen
from libc.stdint cimport uint64_t
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from ..typedefs cimport attr_t
@registry.layers("spacy.RichFeatureExtractor.v1")
def RichFeatureExtractor(
*,
case_sensitive: bool,
pref_lengths: Optional[List[int]] = None,
suff_lengths: Optional[List[int]] = None,
) -> Model[List[Doc], List[Ints2d]]:
# Because the calling code guarantees that the integers in the list are each less than 256,
# the integer list can be converted into *bytes*.
return Model(
"extract_character_combination_hashes",
forward,
attrs={
"case_sensitive": case_sensitive,
"p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(),
"s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(),
},
)
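For illustration only: the comment above relies on Python's bytes() constructor accepting a list of integers below 256, so a list of affix lengths round-trips losslessly.

    # Lengths are capped at 63 by the config validation, so each one fits in a byte:
    assert bytes([1, 3, 4]) == b"\x01\x03\x04"
    assert list(b"\x01\x03\x04") == [1, 3, 4]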
def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
ops = model.ops
case_sensitive: bool = model.attrs["case_sensitive"]
p_lengths: bytes = model.attrs["p_lengths"]
s_lengths: bytes = model.attrs["s_lengths"]
features: List[Ints2d] = []
for doc in docs:
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=p_lengths,
s_lengths=s_lengths,
)
features.append(ops.asarray2i(hashes, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
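For illustration only, a minimal usage sketch of the layer above; the blank English pipeline and the length values are assumptions, not part of the commit.

    import spacy

    nlp = spacy.blank("en")
    extractor = RichFeatureExtractor(
        case_sensitive=False, pref_lengths=[1, 3], suff_lengths=[2, 3]
    )
    features, backprop = extractor([nlp("spaCy and Prodigy")], is_train=False)
    assert features[0].shape == (3, 4)  # 3 tokens, 2 prefix + 2 suffix hashes each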
def get_character_combination_hashes(
*,
Doc doc,
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the raw text of each token.
doc: the document
case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *case_sensitive == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
Note that this method performs no data validation itself, as it expects the calling code to have validated its input already;
the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
"""
# Work out lengths
cdef int p_lengths_l = strlen(<char*> p_lengths)
cdef int s_lengths_l = strlen(<char*> s_lengths)
cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef int doc_l = doc.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_str_l
cdef attr_t num_tok_attr
cdef bytes tok_str_bytes
cdef const unsigned char* tok_str
for tok_i in range(doc_l):
tok_c = <TokenC> doc.c[tok_i]
num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
if num_tok_attr == 0:
tok_str_bytes = b""
else:
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
tok_str = tok_str_bytes
tok_str_l = len(tok_str_bytes)
else:
tok_str, tok_str_l = doc.vocab.strings.utf8_ptr(num_tok_attr)
if p_max_l > 0:
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)
if s_max_l > 0:
_set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
return hashes
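For illustration only, a sketch of what the returned array contains, mirroring the unit tests added later in this commit (en_tokenizer stands in for any English tokenizer fixture):

    doc = en_tokenizer("spaCy✨ and Prodigy")
    hashes = get_character_combination_hashes(
        doc=doc,
        case_sensitive=False,
        p_lengths=bytes((1, 3, 4)),     # hash the 1-, 3- and 4-character prefixes
        s_lengths=bytes((2, 3, 4, 5)),  # hash the 2- to 5-character suffixes
    )
    # One row per token, one column per configured affix length:
    assert hashes.shape == (4, 7)
    # Row 0, column 1 is the FNV-1A hash of the UTF-8 bytes of "spa"; row 0,
    # column 3 is the hash of the last two characters read back-to-front ("yc").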
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil:
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
characters within *tok_str*. Entries corresponding to character lengths greater than the length of the whole word are populated
with the byte length of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
p_max_l: the number of characters to process at the beginning of the word.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
while pref_l_buf_idx < p_max_l:
if (tok_str_idx >= tok_str_l
or
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
):
pref_l_buf[pref_l_buf_idx] = tok_str_idx
pref_l_buf_idx += 1
if tok_str_idx >= tok_str_l:
break
tok_str_idx += 1
if pref_l_buf_idx < p_max_l:
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
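For illustration only, a pure-Python rendering of the helper above (not part of the commit); the worked value uses the multi-byte token from the tests.

    def prefix_byte_lengths(tok: bytes, p_max_l: int) -> list:
        # Mirror of _set_prefix_lengths: record the UTF-8 byte length of the first
        # 1..p_max_l characters, padding with the whole-word length for short words.
        lengths, idx = [], 1
        while len(lengths) < p_max_l:
            if idx >= len(tok) or (tok[idx] & 0xC0) != 0x80:  # not a continuation byte
                lengths.append(idx)
                if idx >= len(tok):
                    break
            idx += 1
        if lengths and len(lengths) < p_max_l:
            lengths += [lengths[-1]] * (p_max_l - len(lengths))
        return lengths

    assert prefix_byte_lengths("sp𐌞cé".encode("utf8"), 3) == [1, 2, 6]  # "s", "sp", "sp𐌞"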
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil:
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
characters within *tok_str*. Entries corresponding to character lengths greater than the length of the whole word are populated
with the byte length of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_max_l: the number of characters to process at the end of the word.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
while suff_l_buf_idx < s_max_l:
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
suff_l_buf_idx += 1
tok_str_idx -= 1
if tok_str_idx < 0:
break
if suff_l_buf_idx < s_max_l:
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
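For illustration only, the suffix analogue as pure Python (again not part of the commit); it walks the UTF-8 bytes backwards from the end of a non-empty token.

    def suffix_byte_lengths(tok: bytes, s_max_l: int) -> list:
        # Mirror of _set_suffix_lengths: byte lengths of the last 1..s_max_l characters.
        lengths, idx = [], len(tok) - 1
        while len(lengths) < s_max_l:
            if (tok[idx] & 0xC0) != 0x80:  # start of a character
                lengths.append(len(tok) - idx)
            idx -= 1
            if idx < 0:
                break
        if lengths and len(lengths) < s_max_l:
            lengths += [lengths[-1]] * (s_max_l - len(lengths))
        return lengths

    assert suffix_byte_lengths("sp𐌞cé".encode("utf8"), 3) == [2, 3, 7]  # "é", "cé", "𐌞cé"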
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil:
""" Write 64-bit FNV1A hashes for a token/rich property group combination.
res_buf: the string from which to generate the hash values.
aff_l_buf: one-byte lengths describing how many characters to hash.
offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in
*res_buf*; if affixes should start at the beginning of *res_buf*, *0*.
hashes_ptr: a pointer starting from which the new hashes should be written.
Returns: the number of hashes written.
"""
cdef int last_offset = 0, hash_idx = 0, offset, aff_l
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
while True:
aff_l = aff_l_buf[hash_idx]
if aff_l == 0:
return hash_idx
offset = offset_buf[aff_l - 1]
while last_offset < offset:
if res_buf_last > 0:
hash_val ^= res_buf[res_buf_last - last_offset]
else:
hash_val ^= res_buf[last_offset]
hash_val *= FNV1A_PRIME
last_offset += 1
hashes_ptr[hash_idx] = hash_val
hash_idx += 1
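For reference and illustration only: a pure-Python FNV-1A with the same constants, matching the _get_fnv1a_hash helper in the tests added by this commit. When res_buf_last > 0 the loop above reads the token bytes back-to-front, which is why the suffix assertions in the tests hash reversed strings.

    FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
    FNV1A_PRIME = 0x00000100000001B3

    def fnv1a(data: bytes) -> int:
        # 64-bit FNV-1A over the given bytes.
        hash_val = FNV1A_OFFSET_BASIS
        for byte in data:
            hash_val = ((hash_val ^ byte) * FNV1A_PRIME) % 2**64
        return hash_val

    # For "spaCy" with case_sensitive=False, the prefix column for length 3 equals
    # fnv1a(b"spa") and the suffix column for length 2 equals fnv1a(b"yc") -- the
    # last two characters read in reverse.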

View File

@ -1,8 +1,6 @@
from pickle import EMPTY_DICT
import weakref
import numpy
from time import time
from numpy.testing import assert_array_equal
import pytest
import warnings
@ -992,635 +990,3 @@ def test_doc_spans_setdefault(en_tokenizer):
assert len(doc.spans["key2"]) == 1
doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]]))
assert len(doc.spans["key3"]) == 2
EMPTY_HASH_VALUE = 0xCBF29CE484222325
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
def _get_fnv1a_hash(input: bytes) -> int:
hash_val = FNV1A_OFFSET_BASIS
length = len(input)
offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
hash_val %= 2**64
offset += 1
return hash_val
def test_fnv1a_hash():
"""Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here, _get_fnv1a_hash(), is only used in testing;
in production code, the hashing is performed in a fashion that is interweaved
with other logic. The conformity of the production code is demonstrated by the
character combination hash tests, where hashes produced by the production code
are tested for equality against hashes produced by _get_fnv1a_hash().
"""
INPUTS = [
b"",
b"a",
b"b",
b"c",
b"d",
b"e",
b"f",
b"fo",
b"foo",
b"foob",
b"fooba",
b"foobar",
b"\x00",
b"a\x00",
b"b\x00",
b"c\x00",
b"d\x00",
b"e\x00",
b"f\x00",
b"fo\x00",
b"foo\x00",
b"foob\x00",
b"fooba\x00",
b"foobar\x00",
b"ch",
b"cho",
b"chon",
b"chong",
b"chongo",
b"chongo ",
b"chongo w",
b"chongo wa",
b"chongo was",
b"chongo was ",
b"chongo was h",
b"chongo was he",
b"chongo was her",
b"chongo was here",
b"chongo was here!",
b"chongo was here!\n",
b"ch\x00",
b"cho\x00",
b"chon\x00",
b"chong\x00",
b"chongo\x00",
b"chongo \x00",
b"chongo w\x00",
b"chongo wa\x00",
b"chongo was\x00",
b"chongo was \x00",
b"chongo was h\x00",
b"chongo was he\x00",
b"chongo was her\x00",
b"chongo was here\x00",
b"chongo was here!\x00",
b"chongo was here!\n\x00",
b"cu",
b"cur",
b"curd",
b"curds",
b"curds ",
b"curds a",
b"curds an",
b"curds and",
b"curds and ",
b"curds and w",
b"curds and wh",
b"curds and whe",
b"curds and whey",
b"curds and whey\n",
b"cu\x00",
b"cur\x00",
b"curd\x00",
b"curds\x00",
b"curds \x00",
b"curds a\x00",
b"curds an\x00",
b"curds and\x00",
b"curds and \x00",
b"curds and w\x00",
b"curds and wh\x00",
b"curds and whe\x00",
b"curds and whey\x00",
b"curds and whey\n\x00",
b"hi",
b"hi\x00",
b"hello",
b"hello\x00",
b"\xff\x00\x00\x01",
b"\x01\x00\x00\xff",
b"\xff\x00\x00\x02",
b"\x02\x00\x00\xff",
b"\xff\x00\x00\x03",
b"\x03\x00\x00\xff",
b"\xff\x00\x00\x04",
b"\x04\x00\x00\xff",
b"\x40\x51\x4e\x44",
b"\x44\x4e\x51\x40",
b"\x40\x51\x4e\x4a",
b"\x4a\x4e\x51\x40",
b"\x40\x51\x4e\x54",
b"\x54\x4e\x51\x40",
b"127.0.0.1",
b"127.0.0.1\x00",
b"127.0.0.2",
b"127.0.0.2\x00",
b"127.0.0.3",
b"127.0.0.3\x00",
b"64.81.78.68",
b"64.81.78.68\x00",
b"64.81.78.74",
b"64.81.78.74\x00",
b"64.81.78.84",
b"64.81.78.84\x00",
b"feedface",
b"feedface\x00",
b"feedfacedaffdeed",
b"feedfacedaffdeed\x00",
b"feedfacedeadbeef",
b"feedfacedeadbeef\x00",
b"line 1\nline 2\nline 3",
b"chongo <Landon Curt Noll> /\\../\\",
b"chongo <Landon Curt Noll> /\\../\\\x00",
b"chongo (Landon Curt Noll) /\\../\\",
b"chongo (Landon Curt Noll) /\\../\\\x00",
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
b"http://epod.usra.edu/",
b"http://exoplanet.eu/",
b"http://hvo.wr.usgs.gov/cam3/",
b"http://hvo.wr.usgs.gov/cams/HMcam/",
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
b"http://neo.jpl.nasa.gov/risk/",
b"http://norvig.com/21-days.html",
b"http://primes.utm.edu/curios/home.php",
b"http://slashdot.org/",
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
b"http://www.avo.alaska.edu/activity/Redoubt.php",
b"http://www.dilbert.com/fast/",
b"http://www.fourmilab.ch/gravitation/orbits/",
b"http://www.fpoa.net/",
b"http://www.ioccc.org/index.html",
b"http://www.isthe.com/cgi-bin/number.cgi",
b"http://www.isthe.com/chongo/bio.html",
b"http://www.isthe.com/chongo/index.html",
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
b"http://www.isthe.com/chongo/tech/astro/vita.html",
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
b"http://www.isthe.com/chongo/tech/math/number/number.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
b"http://www.lavarnd.org/index.html",
b"http://www.lavarnd.org/what/nist-test.html",
b"http://www.macosxhints.com/",
b"http://www.mellis.com/",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
b"http://www.paulnoll.com/",
b"http://www.pepysdiary.com/",
b"http://www.sciencenews.org/index/home/activity/view",
b"http://www.skyandtelescope.com/",
b"http://www.sput.nl/~rob/sirius.html",
b"http://www.systemexperts.com/",
b"http://www.tq-international.com/phpBB3/index.php",
b"http://www.travelquesttours.com/index.htm",
b"http://www.wunderground.com/global/stations/89606.html",
b"21701" * 10,
b"M21701" * 10,
b"2^21701-1" * 10,
b"\x54\xc5" * 10,
b"\xc5\x54" * 10,
b"23209" * 10,
b"M23209" * 10,
b"2^23209-1" * 10,
b"\x5a\xa9" * 10,
b"\xa9\x5a" * 10,
b"391581216093" * 10,
b"391581*2^216093-1" * 10,
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
b"FEDCBA9876543210" * 10,
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
b"EFCDAB8967452301" * 10,
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
b"0123456789ABCDEF" * 10,
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
b"1032547698BADCFE" * 10,
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
b"\x00" * 500,
b"\x07" * 500,
b"~" * 500,
b"\x7f" * 500,
]
OUTPUTS = [
EMPTY_HASH_VALUE,
0xAF63DC4C8601EC8C,
0xAF63DF4C8601F1A5,
0xAF63DE4C8601EFF2,
0xAF63D94C8601E773,
0xAF63D84C8601E5C0,
0xAF63DB4C8601EAD9,
0x08985907B541D342,
0xDCB27518FED9D577,
0xDD120E790C2512AF,
0xCAC165AFA2FEF40A,
0x85944171F73967E8,
0xAF63BD4C8601B7DF,
0x089BE207B544F1E4,
0x08A61407B54D9B5F,
0x08A2AE07B54AB836,
0x0891B007B53C4869,
0x088E4A07B5396540,
0x08987C07B5420EBB,
0xDCB28A18FED9F926,
0xDD1270790C25B935,
0xCAC146AFA2FEBF5D,
0x8593D371F738ACFE,
0x34531CA7168B8F38,
0x08A25607B54A22AE,
0xF5FAF0190CF90DF3,
0xF27397910B3221C7,
0x2C8C2B76062F22E0,
0xE150688C8217B8FD,
0xF35A83C10E4F1F87,
0xD1EDD10B507344D0,
0x2A5EE739B3DDB8C3,
0xDCFB970CA1C0D310,
0x4054DA76DAA6DA90,
0xF70A2FF589861368,
0x4C628B38AED25F17,
0x9DD1F6510F78189F,
0xA3DE85BD491270CE,
0x858E2FA32A55E61D,
0x46810940EFF5F915,
0xF5FADD190CF8EDAA,
0xF273ED910B32B3E9,
0x2C8C5276062F6525,
0xE150B98C821842A0,
0xF35AA3C10E4F55E7,
0xD1ED680B50729265,
0x2A5F0639B3DDED70,
0xDCFBAA0CA1C0F359,
0x4054BA76DAA6A430,
0xF709C7F5898562B0,
0x4C62E638AED2F9B8,
0x9DD1A8510F779415,
0xA3DE2ABD4911D62D,
0x858E0EA32A55AE0A,
0x46810F40EFF60347,
0xC33BCE57BEF63EAF,
0x08A24307B54A0265,
0xF5B9FD190CC18D15,
0x4C968290ACE35703,
0x07174BD5C64D9350,
0x5A294C3FF5D18750,
0x05B3C1AEB308B843,
0xB92A48DA37D0F477,
0x73CDDDCCD80EBC49,
0xD58C4C13210A266B,
0xE78B6081243EC194,
0xB096F77096A39F34,
0xB425C54FF807B6A3,
0x23E520E2751BB46E,
0x1A0B44CCFE1385EC,
0xF5BA4B190CC2119F,
0x4C962690ACE2BAAF,
0x0716DED5C64CDA19,
0x5A292C3FF5D150F0,
0x05B3E0AEB308ECF0,
0xB92A5EDA37D119D9,
0x73CE41CCD80F6635,
0xD58C2C132109F00B,
0xE78BAF81243F47D1,
0xB0968F7096A2EE7C,
0xB425A84FF807855C,
0x23E4E9E2751B56F9,
0x1A0B4ECCFE1396EA,
0x54ABD453BB2C9004,
0x08BA5F07B55EC3DA,
0x337354193006CB6E,
0xA430D84680AABD0B,
0xA9BC8ACCA21F39B1,
0x6961196491CC682D,
0xAD2BB1774799DFE9,
0x6961166491CC6314,
0x8D1BB3904A3B1236,
0x6961176491CC64C7,
0xED205D87F40434C7,
0x6961146491CC5FAE,
0xCD3BAF5E44F8AD9C,
0xE3B36596127CD6D8,
0xF77F1072C8E8A646,
0xE3B36396127CD372,
0x6067DCE9932AD458,
0xE3B37596127CF208,
0x4B7B10FA9FE83936,
0xAABAFE7104D914BE,
0xF4D3180B3CDE3EDA,
0xAABAFD7104D9130B,
0xF4CFB20B3CDB5BB1,
0xAABAFC7104D91158,
0xF4CC4C0B3CD87888,
0xE729BAC5D2A8D3A7,
0x74BC0524F4DFA4C5,
0xE72630C5D2A5B352,
0x6B983224EF8FB456,
0xE73042C5D2AE266D,
0x8527E324FDEB4B37,
0x0A83C86FEE952ABC,
0x7318523267779D74,
0x3E66D3D56B8CACA1,
0x956694A5C0095593,
0xCAC54572BB1A6FC8,
0xA7A4C9F3EDEBF0D8,
0x7829851FAC17B143,
0x2C8F4C9AF81BCF06,
0xD34E31539740C732,
0x3605A2AC253D2DB1,
0x08C11B8346F4A3C3,
0x6BE396289CE8A6DA,
0xD9B957FB7FE794C5,
0x05BE33DA04560A93,
0x0957F1577BA9747C,
0xDA2CC3ACC24FBA57,
0x74136F185B29E7F0,
0xB2F2B4590EDB93B2,
0xB3608FCE8B86AE04,
0x4A3A865079359063,
0x5B3A7EF496880A50,
0x48FAE3163854C23B,
0x07AAA640476E0B9A,
0x2F653656383A687D,
0xA1031F8E7599D79C,
0xA31908178FF92477,
0x097EDF3C14C3FB83,
0xB51CA83FEAA0971B,
0xDD3C0D96D784F2E9,
0x86CD26A9EA767D78,
0xE6B215FF54A30C18,
0xEC5B06A1C5531093,
0x45665A929F9EC5E5,
0x8C7609B4A9F10907,
0x89AAC3A491F0D729,
0x32CE6B26E0F4A403,
0x614AB44E02B53E01,
0xFA6472EB6EEF3290,
0x9E5D75EB1948EB6A,
0xB6D12AD4A8671852,
0x88826F56EBA07AF1,
0x44535BF2645BC0FD,
0x169388FFC21E3728,
0xF68AAC9E396D8224,
0x8E87D7E7472B3883,
0x295C26CAA8B423DE,
0x322C814292E72176,
0x8A06550EB8AF7268,
0xEF86D60E661BCF71,
0x9E5426C87F30EE54,
0xF1EA8AA826FD047E,
0x0BABAF9A642CB769,
0x4B3341D4068D012E,
0xD15605CBC30A335C,
0x5B21060AED8412E5,
0x45E2CDA1CE6F4227,
0x50AE3745033AD7D4,
0xAA4588CED46BF414,
0xC1B0056C4A95467E,
0x56576A71DE8B4089,
0xBF20965FA6DC927E,
0x569F8383C2040882,
0xE1E772FBA08FECA0,
0x4CED94AF97138AC4,
0xC4112FFB337A82FB,
0xD64A4FD41DE38B7D,
0x4CFC32329EDEBCBB,
0x0803564445050395,
0xAA1574ECF4642FFD,
0x694BC4E54CC315F9,
0xA3D7CB273B011721,
0x577C2F8B6115BFA5,
0xB7EC8C1A769FB4C1,
0x5D5CFCE63359AB19,
0x33B96C3CD65B5F71,
0xD845097780602BB9,
0x84D47645D02DA3D5,
0x83544F33B58773A5,
0x9175CBB2160836C5,
0xC71B3BC175E72BC5,
0x636806AC222EC985,
0xB6EF0E6950F52ED5,
0xEAD3D8A0F3DFDAA5,
0x922908FE9A861BA5,
0x6D4821DE275FD5C5,
0x1FE3FCE62BD816B5,
0xC23E9FCCD6F70591,
0xC1AF12BDFE16B5B5,
0x39E9F18F2F85E221,
]
assert len(INPUTS) == len(OUTPUTS)
for i in range(len(INPUTS)):
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
encoded_input = input.encode("UTF-8")
if reverse:
encoded_input = encoded_input[::-1]
return _get_fnv1a_hash(encoded_input)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
3,
4,
)
),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("s")
assert hashes[0][1] == _encode_and_hash("spa")
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
assert hashes[1][0] == _encode_and_hash("")
assert hashes[1][1] == _encode_and_hash("")
assert hashes[1][2] == _encode_and_hash("")
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[1][4] == _encode_and_hash("", reverse=True)
assert hashes[1][5] == _encode_and_hash("", reverse=True)
assert hashes[1][6] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("a")
assert hashes[2][1] == _encode_and_hash("and")
assert hashes[2][2] == _encode_and_hash("and")
assert hashes[2][3] == _encode_and_hash("dn")
assert hashes[2][4] == _encode_and_hash("dna")
assert hashes[2][5] == _encode_and_hash("dna")
assert hashes[2][6] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
assert hashes[3][3] == _encode_and_hash("yg")
assert hashes[3][4] == _encode_and_hash("ygi")
assert hashes[3][5] == _encode_and_hash("ygid")
assert hashes[3][6] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_good_case_partial(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
p_lengths=bytes(),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("yc")
assert hashes[0][1] == _encode_and_hash("yca")
assert hashes[0][2] == _encode_and_hash("ycap")
assert hashes[0][3] == _encode_and_hash("ycaps")
assert hashes[1][0] == _encode_and_hash("", reverse=True)
assert hashes[1][1] == _encode_and_hash("", reverse=True)
assert hashes[1][2] == _encode_and_hash("", reverse=True)
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("dn")
assert hashes[2][1] == _encode_and_hash("dna")
assert hashes[2][2] == _encode_and_hash("dna")
assert hashes[2][3] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("yg")
assert hashes[3][1] == _encode_and_hash("ygi")
assert hashes[3][2] == _encode_and_hash("ygid")
assert hashes[3][3] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_various_lengths(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
for p_length in range(1, 8):
for s_length in range(1, 8):
hashes = doc.get_character_combination_hashes(
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
)
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_turkish_i_with_dot(
en_tokenizer, case_sensitive
):
doc = en_tokenizer("İ".lower() + "İ")
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
2,
3,
4,
)
),
s_lengths=bytes(
(
1,
2,
3,
4,
)
),
)
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
assert hashes[0][0] == _encode_and_hash("i")
assert hashes[0][1] == _encode_and_hash("İ".lower())
if case_sensitive:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
else:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
assert hashes[0][6] == _encode_and_hash(
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
)
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_string_store_spec_cases(
en_tokenizer, case_sensitive
):
symbol = "FLAG19"
short_word = "bee"
normal_word = "serendipity"
long_word = "serendipity" * 50
assert len(long_word) > 255
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4
hashes = doc.get_character_combination_hashes(
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
)
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _encode_and_hash("91")
assert hashes[1][0] == _encode_and_hash("be")
assert hashes[1][1] == _encode_and_hash("ee")
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert doc.get_character_combination_hashes(
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
).shape == (1, 0)

View File

@ -0,0 +1,639 @@
import pytest
from ...ml.richfeatureextractor import get_character_combination_hashes
EMPTY_HASH_VALUE = 0xCBF29CE484222325
FNV1A_OFFSET_BASIS = 0xCBF29CE484222325
FNV1A_PRIME = 0x00000100000001B3
def _get_fnv1a_hash(input: bytes) -> int:
hash_val = FNV1A_OFFSET_BASIS
length = len(input)
offset = 0
while offset < length:
hash_val ^= input[offset]
hash_val *= FNV1A_PRIME
hash_val %= 2**64
offset += 1
return hash_val
def test_fnv1a_hash():
"""Checks the conformity of the 64-bit FNV1A implementation with
http://www.isthe.com/chongo/src/fnv/test_fnv.c.
The method called here, _get_fnv1a_hash(), is only used in testing;
in production code, the hashing is performed in a fashion that is interwoven
with other logic. The conformity of the production code is demonstrated by the
character combination hash tests, where hashes produced by the production code
are tested for equality against hashes produced by _get_fnv1a_hash().
"""
INPUTS = [
b"",
b"a",
b"b",
b"c",
b"d",
b"e",
b"f",
b"fo",
b"foo",
b"foob",
b"fooba",
b"foobar",
b"\x00",
b"a\x00",
b"b\x00",
b"c\x00",
b"d\x00",
b"e\x00",
b"f\x00",
b"fo\x00",
b"foo\x00",
b"foob\x00",
b"fooba\x00",
b"foobar\x00",
b"ch",
b"cho",
b"chon",
b"chong",
b"chongo",
b"chongo ",
b"chongo w",
b"chongo wa",
b"chongo was",
b"chongo was ",
b"chongo was h",
b"chongo was he",
b"chongo was her",
b"chongo was here",
b"chongo was here!",
b"chongo was here!\n",
b"ch\x00",
b"cho\x00",
b"chon\x00",
b"chong\x00",
b"chongo\x00",
b"chongo \x00",
b"chongo w\x00",
b"chongo wa\x00",
b"chongo was\x00",
b"chongo was \x00",
b"chongo was h\x00",
b"chongo was he\x00",
b"chongo was her\x00",
b"chongo was here\x00",
b"chongo was here!\x00",
b"chongo was here!\n\x00",
b"cu",
b"cur",
b"curd",
b"curds",
b"curds ",
b"curds a",
b"curds an",
b"curds and",
b"curds and ",
b"curds and w",
b"curds and wh",
b"curds and whe",
b"curds and whey",
b"curds and whey\n",
b"cu\x00",
b"cur\x00",
b"curd\x00",
b"curds\x00",
b"curds \x00",
b"curds a\x00",
b"curds an\x00",
b"curds and\x00",
b"curds and \x00",
b"curds and w\x00",
b"curds and wh\x00",
b"curds and whe\x00",
b"curds and whey\x00",
b"curds and whey\n\x00",
b"hi",
b"hi\x00",
b"hello",
b"hello\x00",
b"\xff\x00\x00\x01",
b"\x01\x00\x00\xff",
b"\xff\x00\x00\x02",
b"\x02\x00\x00\xff",
b"\xff\x00\x00\x03",
b"\x03\x00\x00\xff",
b"\xff\x00\x00\x04",
b"\x04\x00\x00\xff",
b"\x40\x51\x4e\x44",
b"\x44\x4e\x51\x40",
b"\x40\x51\x4e\x4a",
b"\x4a\x4e\x51\x40",
b"\x40\x51\x4e\x54",
b"\x54\x4e\x51\x40",
b"127.0.0.1",
b"127.0.0.1\x00",
b"127.0.0.2",
b"127.0.0.2\x00",
b"127.0.0.3",
b"127.0.0.3\x00",
b"64.81.78.68",
b"64.81.78.68\x00",
b"64.81.78.74",
b"64.81.78.74\x00",
b"64.81.78.84",
b"64.81.78.84\x00",
b"feedface",
b"feedface\x00",
b"feedfacedaffdeed",
b"feedfacedaffdeed\x00",
b"feedfacedeadbeef",
b"feedfacedeadbeef\x00",
b"line 1\nline 2\nline 3",
b"chongo <Landon Curt Noll> /\\../\\",
b"chongo <Landon Curt Noll> /\\../\\\x00",
b"chongo (Landon Curt Noll) /\\../\\",
b"chongo (Landon Curt Noll) /\\../\\\x00",
b"http://antwrp.gsfc.nasa.gov/apod/astropix.html",
b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash",
b"http://epod.usra.edu/",
b"http://exoplanet.eu/",
b"http://hvo.wr.usgs.gov/cam3/",
b"http://hvo.wr.usgs.gov/cams/HMcam/",
b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html",
b"http://hvo.wr.usgs.gov/kilauea/update/images.html",
b"http://hvo.wr.usgs.gov/kilauea/update/maps.html",
b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html",
b"http://neo.jpl.nasa.gov/risk/",
b"http://norvig.com/21-days.html",
b"http://primes.utm.edu/curios/home.php",
b"http://slashdot.org/",
b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html",
b"http://volcano.wr.usgs.gov/kilaueastatus.php",
b"http://www.avo.alaska.edu/activity/Redoubt.php",
b"http://www.dilbert.com/fast/",
b"http://www.fourmilab.ch/gravitation/orbits/",
b"http://www.fpoa.net/",
b"http://www.ioccc.org/index.html",
b"http://www.isthe.com/cgi-bin/number.cgi",
b"http://www.isthe.com/chongo/bio.html",
b"http://www.isthe.com/chongo/index.html",
b"http://www.isthe.com/chongo/src/calc/lucas-calc",
b"http://www.isthe.com/chongo/tech/astro/venus2004.html",
b"http://www.isthe.com/chongo/tech/astro/vita.html",
b"http://www.isthe.com/chongo/tech/comp/c/expert.html",
b"http://www.isthe.com/chongo/tech/comp/calc/index.html",
b"http://www.isthe.com/chongo/tech/comp/fnv/index.html",
b"http://www.isthe.com/chongo/tech/math/number/howhigh.html",
b"http://www.isthe.com/chongo/tech/math/number/number.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html",
b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest",
b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi",
b"http://www.lavarnd.org/cgi-bin/haiku.cgi",
b"http://www.lavarnd.org/cgi-bin/rand-none.cgi",
b"http://www.lavarnd.org/cgi-bin/randdist.cgi",
b"http://www.lavarnd.org/index.html",
b"http://www.lavarnd.org/what/nist-test.html",
b"http://www.macosxhints.com/",
b"http://www.mellis.com/",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm",
b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm",
b"http://www.paulnoll.com/",
b"http://www.pepysdiary.com/",
b"http://www.sciencenews.org/index/home/activity/view",
b"http://www.skyandtelescope.com/",
b"http://www.sput.nl/~rob/sirius.html",
b"http://www.systemexperts.com/",
b"http://www.tq-international.com/phpBB3/index.php",
b"http://www.travelquesttours.com/index.htm",
b"http://www.wunderground.com/global/stations/89606.html",
b"21701" * 10,
b"M21701" * 10,
b"2^21701-1" * 10,
b"\x54\xc5" * 10,
b"\xc5\x54" * 10,
b"23209" * 10,
b"M23209" * 10,
b"2^23209-1" * 10,
b"\x5a\xa9" * 10,
b"\xa9\x5a" * 10,
b"391581216093" * 10,
b"391581*2^216093-1" * 10,
b"\x05\xf9\x9d\x03\x4c\x81" * 10,
b"FEDCBA9876543210" * 10,
b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10,
b"EFCDAB8967452301" * 10,
b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10,
b"0123456789ABCDEF" * 10,
b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10,
b"1032547698BADCFE" * 10,
b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10,
b"\x00" * 500,
b"\x07" * 500,
b"~" * 500,
b"\x7f" * 500,
]
OUTPUTS = [
EMPTY_HASH_VALUE,
0xAF63DC4C8601EC8C,
0xAF63DF4C8601F1A5,
0xAF63DE4C8601EFF2,
0xAF63D94C8601E773,
0xAF63D84C8601E5C0,
0xAF63DB4C8601EAD9,
0x08985907B541D342,
0xDCB27518FED9D577,
0xDD120E790C2512AF,
0xCAC165AFA2FEF40A,
0x85944171F73967E8,
0xAF63BD4C8601B7DF,
0x089BE207B544F1E4,
0x08A61407B54D9B5F,
0x08A2AE07B54AB836,
0x0891B007B53C4869,
0x088E4A07B5396540,
0x08987C07B5420EBB,
0xDCB28A18FED9F926,
0xDD1270790C25B935,
0xCAC146AFA2FEBF5D,
0x8593D371F738ACFE,
0x34531CA7168B8F38,
0x08A25607B54A22AE,
0xF5FAF0190CF90DF3,
0xF27397910B3221C7,
0x2C8C2B76062F22E0,
0xE150688C8217B8FD,
0xF35A83C10E4F1F87,
0xD1EDD10B507344D0,
0x2A5EE739B3DDB8C3,
0xDCFB970CA1C0D310,
0x4054DA76DAA6DA90,
0xF70A2FF589861368,
0x4C628B38AED25F17,
0x9DD1F6510F78189F,
0xA3DE85BD491270CE,
0x858E2FA32A55E61D,
0x46810940EFF5F915,
0xF5FADD190CF8EDAA,
0xF273ED910B32B3E9,
0x2C8C5276062F6525,
0xE150B98C821842A0,
0xF35AA3C10E4F55E7,
0xD1ED680B50729265,
0x2A5F0639B3DDED70,
0xDCFBAA0CA1C0F359,
0x4054BA76DAA6A430,
0xF709C7F5898562B0,
0x4C62E638AED2F9B8,
0x9DD1A8510F779415,
0xA3DE2ABD4911D62D,
0x858E0EA32A55AE0A,
0x46810F40EFF60347,
0xC33BCE57BEF63EAF,
0x08A24307B54A0265,
0xF5B9FD190CC18D15,
0x4C968290ACE35703,
0x07174BD5C64D9350,
0x5A294C3FF5D18750,
0x05B3C1AEB308B843,
0xB92A48DA37D0F477,
0x73CDDDCCD80EBC49,
0xD58C4C13210A266B,
0xE78B6081243EC194,
0xB096F77096A39F34,
0xB425C54FF807B6A3,
0x23E520E2751BB46E,
0x1A0B44CCFE1385EC,
0xF5BA4B190CC2119F,
0x4C962690ACE2BAAF,
0x0716DED5C64CDA19,
0x5A292C3FF5D150F0,
0x05B3E0AEB308ECF0,
0xB92A5EDA37D119D9,
0x73CE41CCD80F6635,
0xD58C2C132109F00B,
0xE78BAF81243F47D1,
0xB0968F7096A2EE7C,
0xB425A84FF807855C,
0x23E4E9E2751B56F9,
0x1A0B4ECCFE1396EA,
0x54ABD453BB2C9004,
0x08BA5F07B55EC3DA,
0x337354193006CB6E,
0xA430D84680AABD0B,
0xA9BC8ACCA21F39B1,
0x6961196491CC682D,
0xAD2BB1774799DFE9,
0x6961166491CC6314,
0x8D1BB3904A3B1236,
0x6961176491CC64C7,
0xED205D87F40434C7,
0x6961146491CC5FAE,
0xCD3BAF5E44F8AD9C,
0xE3B36596127CD6D8,
0xF77F1072C8E8A646,
0xE3B36396127CD372,
0x6067DCE9932AD458,
0xE3B37596127CF208,
0x4B7B10FA9FE83936,
0xAABAFE7104D914BE,
0xF4D3180B3CDE3EDA,
0xAABAFD7104D9130B,
0xF4CFB20B3CDB5BB1,
0xAABAFC7104D91158,
0xF4CC4C0B3CD87888,
0xE729BAC5D2A8D3A7,
0x74BC0524F4DFA4C5,
0xE72630C5D2A5B352,
0x6B983224EF8FB456,
0xE73042C5D2AE266D,
0x8527E324FDEB4B37,
0x0A83C86FEE952ABC,
0x7318523267779D74,
0x3E66D3D56B8CACA1,
0x956694A5C0095593,
0xCAC54572BB1A6FC8,
0xA7A4C9F3EDEBF0D8,
0x7829851FAC17B143,
0x2C8F4C9AF81BCF06,
0xD34E31539740C732,
0x3605A2AC253D2DB1,
0x08C11B8346F4A3C3,
0x6BE396289CE8A6DA,
0xD9B957FB7FE794C5,
0x05BE33DA04560A93,
0x0957F1577BA9747C,
0xDA2CC3ACC24FBA57,
0x74136F185B29E7F0,
0xB2F2B4590EDB93B2,
0xB3608FCE8B86AE04,
0x4A3A865079359063,
0x5B3A7EF496880A50,
0x48FAE3163854C23B,
0x07AAA640476E0B9A,
0x2F653656383A687D,
0xA1031F8E7599D79C,
0xA31908178FF92477,
0x097EDF3C14C3FB83,
0xB51CA83FEAA0971B,
0xDD3C0D96D784F2E9,
0x86CD26A9EA767D78,
0xE6B215FF54A30C18,
0xEC5B06A1C5531093,
0x45665A929F9EC5E5,
0x8C7609B4A9F10907,
0x89AAC3A491F0D729,
0x32CE6B26E0F4A403,
0x614AB44E02B53E01,
0xFA6472EB6EEF3290,
0x9E5D75EB1948EB6A,
0xB6D12AD4A8671852,
0x88826F56EBA07AF1,
0x44535BF2645BC0FD,
0x169388FFC21E3728,
0xF68AAC9E396D8224,
0x8E87D7E7472B3883,
0x295C26CAA8B423DE,
0x322C814292E72176,
0x8A06550EB8AF7268,
0xEF86D60E661BCF71,
0x9E5426C87F30EE54,
0xF1EA8AA826FD047E,
0x0BABAF9A642CB769,
0x4B3341D4068D012E,
0xD15605CBC30A335C,
0x5B21060AED8412E5,
0x45E2CDA1CE6F4227,
0x50AE3745033AD7D4,
0xAA4588CED46BF414,
0xC1B0056C4A95467E,
0x56576A71DE8B4089,
0xBF20965FA6DC927E,
0x569F8383C2040882,
0xE1E772FBA08FECA0,
0x4CED94AF97138AC4,
0xC4112FFB337A82FB,
0xD64A4FD41DE38B7D,
0x4CFC32329EDEBCBB,
0x0803564445050395,
0xAA1574ECF4642FFD,
0x694BC4E54CC315F9,
0xA3D7CB273B011721,
0x577C2F8B6115BFA5,
0xB7EC8C1A769FB4C1,
0x5D5CFCE63359AB19,
0x33B96C3CD65B5F71,
0xD845097780602BB9,
0x84D47645D02DA3D5,
0x83544F33B58773A5,
0x9175CBB2160836C5,
0xC71B3BC175E72BC5,
0x636806AC222EC985,
0xB6EF0E6950F52ED5,
0xEAD3D8A0F3DFDAA5,
0x922908FE9A861BA5,
0x6D4821DE275FD5C5,
0x1FE3FCE62BD816B5,
0xC23E9FCCD6F70591,
0xC1AF12BDFE16B5B5,
0x39E9F18F2F85E221,
]
assert len(INPUTS) == len(OUTPUTS)
for i in range(len(INPUTS)):
assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i]
def _encode_and_hash(input: str, *, reverse: bool = False) -> int:
encoded_input = input.encode("UTF-8")
if reverse:
encoded_input = encoded_input[::-1]
return _get_fnv1a_hash(encoded_input)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
3,
4,
)
),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("s")
assert hashes[0][1] == _encode_and_hash("spa")
assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac")
assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc")
assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca")
assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap")
assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps")
assert hashes[1][0] == _encode_and_hash("")
assert hashes[1][1] == _encode_and_hash("")
assert hashes[1][2] == _encode_and_hash("")
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[1][4] == _encode_and_hash("", reverse=True)
assert hashes[1][5] == _encode_and_hash("", reverse=True)
assert hashes[1][6] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("a")
assert hashes[2][1] == _encode_and_hash("and")
assert hashes[2][2] == _encode_and_hash("and")
assert hashes[2][3] == _encode_and_hash("dn")
assert hashes[2][4] == _encode_and_hash("dna")
assert hashes[2][5] == _encode_and_hash("dna")
assert hashes[2][6] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p")
assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro")
assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod")
assert hashes[3][3] == _encode_and_hash("yg")
assert hashes[3][4] == _encode_and_hash("ygi")
assert hashes[3][5] == _encode_and_hash("ygid")
assert hashes[3][6] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_good_case_no_prefixes(en_tokenizer):
doc = en_tokenizer("spaCy✨ and Prodigy")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=False,
p_lengths=bytes(),
s_lengths=bytes(
(
2,
3,
4,
5,
)
),
)
assert hashes[0][0] == _encode_and_hash("yc")
assert hashes[0][1] == _encode_and_hash("yca")
assert hashes[0][2] == _encode_and_hash("ycap")
assert hashes[0][3] == _encode_and_hash("ycaps")
assert hashes[1][0] == _encode_and_hash("", reverse=True)
assert hashes[1][1] == _encode_and_hash("", reverse=True)
assert hashes[1][2] == _encode_and_hash("", reverse=True)
assert hashes[1][3] == _encode_and_hash("", reverse=True)
assert hashes[2][0] == _encode_and_hash("dn")
assert hashes[2][1] == _encode_and_hash("dna")
assert hashes[2][2] == _encode_and_hash("dna")
assert hashes[2][3] == _encode_and_hash("dna")
assert hashes[3][0] == _encode_and_hash("yg")
assert hashes[3][1] == _encode_and_hash("ygi")
assert hashes[3][2] == _encode_and_hash("ygid")
assert hashes[3][3] == _encode_and_hash("ygido")
def test_get_character_combination_hashes_loop_through_lengths(en_tokenizer):
doc = en_tokenizer("sp𐌞Cé")
for p_length in range(1, 8):
for s_length in range(1, 8):
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=False,
p_lengths=bytes((p_length,)),
s_lengths=bytes((s_length,)),
)
assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length])
assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_turkish_i_with_dot(
en_tokenizer, case_sensitive
):
doc = en_tokenizer("İ".lower() + "İ")
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes(
(
1,
2,
3,
4,
)
),
s_lengths=bytes(
(
1,
2,
3,
4,
)
),
)
COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8")
assert hashes[0][0] == _encode_and_hash("i")
assert hashes[0][1] == _encode_and_hash("İ".lower())
if case_sensitive:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ")
assert hashes[0][4] == _encode_and_hash("İ", reverse=True)
assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True)
assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True)
assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True)
else:
assert hashes[0][2] == _encode_and_hash("İ".lower() + "i")
assert hashes[0][3] == _encode_and_hash("İ".lower() * 2)
assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True)
assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True)
assert hashes[0][6] == _encode_and_hash(
COMBINING_DOT_ABOVE + "İ".lower(), reverse=True
)
assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True)
@pytest.mark.parametrize("case_sensitive", [True, False])
def test_get_character_combination_hashes_string_store_spec_cases(
en_tokenizer, case_sensitive
):
symbol = "FLAG19"
short_word = "bee"
normal_word = "serendipity"
long_word = "serendipity" * 50
assert len(long_word) > 255
doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word)))
assert len(doc) == 4
hashes = get_character_combination_hashes(
doc=doc,
case_sensitive=case_sensitive,
p_lengths=bytes((2,)),
s_lengths=bytes((2,)),
)
assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl")
assert hashes[0][1] == _encode_and_hash("91")
assert hashes[1][0] == _encode_and_hash("be")
assert hashes[1][1] == _encode_and_hash("ee")
assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se")
assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt")
def test_character_combination_hashes_empty_lengths(en_tokenizer):
doc = en_tokenizer("and𐌞")
assert get_character_combination_hashes(
doc=doc,
case_sensitive=True,
p_lengths=bytes(),
s_lengths=bytes(),
).shape == (1, 0)

View File

@ -8,7 +8,8 @@ from ..attrs cimport attr_id_t
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef const TokenC* const_TokenC_ptr
@ -18,7 +19,6 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
@ -34,31 +34,6 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil
cdef class Doc:
cdef readonly Pool mem
cdef readonly Vocab vocab

View File

@ -126,7 +126,7 @@ class Doc:
blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ...,
default: str = ...,
default: str = ...
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
@ -174,12 +174,5 @@ class Doc:
self, doc_json: Dict[str, Any] = ..., validate: bool = False
) -> Doc: ...
def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ...
def get_character_combination_hashes(
self,
*,
case_sensitive: bool,
p_lengths: bytes,
s_lengths: bytes,
) -> Ints2d: ...
@staticmethod
def _get_array_attrs() -> Tuple[Any]: ...

View File

@ -1,9 +1,9 @@
# cython: infer_types=True, bounds_check=False, profile=True
from typing import Set, List
from typing import Set
cimport cython
cimport numpy as np
from libc.string cimport memcpy, memcmp, memset, strlen
from libc.string cimport memcpy
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
@ -21,7 +21,6 @@ from .span cimport Span
from .token cimport MISSING_DEP
from ._dict_proxies import SpanGroups
from .token cimport Token
from ..symbols import NAMES as SYMBOLS_BY_INT
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
@ -41,7 +40,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
from ..util import get_words_and_spaces
DEF PADDING = 5
MAX_UTF8_CHAR_BYTE_WIDTH = 4
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
@ -1746,77 +1745,6 @@ cdef class Doc:
j += 1
return output
def get_character_combination_hashes(self,
*,
const bint case_sensitive,
const unsigned char* p_lengths,
const unsigned char* s_lengths,
):
"""
Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations
derived from the raw text of each token.
case_sensitive: if *False*, hashes are generated based on the lower-case version of each token.
p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order.
For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa".
s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order.
For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa".
Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of
the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible
for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte
character widths, that individual values within buffers can never exceed the capacity of a single byte (255).
Note that this method performs no data validation itself as it expects the calling code will already have done so, and
that the behaviour of the code may be erratic if the supplied parameters do not conform to expectations.
"""
# Work out lengths
cdef int p_lengths_l = strlen(<char*> p_lengths)
cdef int s_lengths_l = strlen(<char*> s_lengths)
cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0
cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0
# Define / allocate buffers
cdef Pool mem = Pool()
cdef unsigned char* pref_l_buf = <unsigned char*> mem.alloc(p_max_l, sizeof(char))
cdef unsigned char* suff_l_buf = <unsigned char*> mem.alloc(s_max_l, sizeof(char))
cdef int doc_l = self.length
cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty(
(doc_l, p_lengths_l + s_lengths_l), dtype="uint64")
cdef np.uint64_t* hashes_ptr = <np.uint64_t*> hashes.data
# Define working variables
cdef TokenC tok_c
cdef int tok_i, tok_str_l
cdef attr_t num_tok_attr
cdef bytes tok_str_bytes
cdef const unsigned char* tok_str
for tok_i in range(doc_l):
tok_c = self.c[tok_i]
num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower
if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens
if num_tok_attr == 0:
tok_str_bytes = b""
else:
tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8")
tok_str = tok_str_bytes
tok_str_l = len(tok_str_bytes)
else:
tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr)
if p_max_l > 0:
_set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf)
hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr)
if s_max_l > 0:
_set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf)
hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr)
return hashes
@staticmethod
def _get_array_attrs():
attrs = [LENGTH, SPACY]
@ -1998,113 +1926,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
return lca_matrix
cdef void _set_prefix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int p_max_l,
unsigned char* pref_l_buf,
) nogil:
""" Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
p_max_l: the number of characters to process at the beginning of the word.
pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = 1, pref_l_buf_idx = 0
while pref_l_buf_idx < p_max_l:
if (tok_str_idx >= tok_str_l
or
((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character
):
pref_l_buf[pref_l_buf_idx] = tok_str_idx
pref_l_buf_idx += 1
if tok_str_idx >= tok_str_l:
break
tok_str_idx += 1
if pref_l_buf_idx < p_max_l:
memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx)
cdef void _set_suffix_lengths(
const unsigned char* tok_str,
const int tok_str_l,
const int s_max_l,
unsigned char* suff_l_buf,
) nogil:
""" Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l*
characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length
of the whole word.
tok_str: a UTF-8 representation of a string.
tok_str_l: the length of *tok_str*.
s_max_l: the number of characters to process at the end of the word.
suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is
responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values
within the buffer can never exceed the capacity of a single byte (255).
"""
cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0
while suff_l_buf_idx < s_max_l:
if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character
suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx
suff_l_buf_idx += 1
tok_str_idx -= 1
if tok_str_idx < 0:
break
if suff_l_buf_idx < s_max_l:
memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx)
cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325
cdef uint64_t FNV1A_PRIME = 0x00000100000001B3
cdef int _write_hashes(
const unsigned char* res_buf,
const unsigned char* aff_l_buf,
const unsigned char* offset_buf,
const int res_buf_last,
np.uint64_t* hashes_ptr,
) nogil:
""" Write 64-bit FNV1A hashes for a token/rich property group combination.
res_buf: the string from which to generate the hash values.
aff_l_buf: one-byte lengths describing how many characters to hash.
offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*.
res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in
*res_buf*; if affixes should start at the beginning of *res_buf*, *0*.
hashes_ptr: a pointer starting from which the new hashes should be written.
Returns: the number of hashes written.
"""
cdef int last_offset = 0, hash_idx = 0, offset, aff_l
cdef uint64_t hash_val = FNV1A_OFFSET_BASIS
while True:
aff_l = aff_l_buf[hash_idx]
if aff_l == 0:
return hash_idx
offset = offset_buf[aff_l - 1]
while last_offset < offset:
if res_buf_last > 0:
hash_val ^= res_buf[res_buf_last - last_offset]
else:
hash_val ^= res_buf[last_offset]
hash_val *= FNV1A_PRIME
last_offset += 1
hashes_ptr[hash_idx] = hash_val
hash_idx += 1
def pickle_doc(doc):
bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"])
hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,