diff --git a/setup.py b/setup.py index 243554c7a..6be2210ee 100755 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ MOD_NAMES = [ "spacy.kb.kb", "spacy.kb.kb_in_memory", "spacy.ml.parser_model", + "spacy.ml.richfeatureextractor", "spacy.morphology", "spacy.pipeline.dep_parser", "spacy.pipeline._edit_tree_internals.edit_trees", diff --git a/spacy/errors.py b/spacy/errors.py index 091997792..0dbb2b04b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -973,6 +973,7 @@ class Errors(metaclass=ErrorsWithCodes): E1051 = ("Invalid rich group config '{label}'.") E1052 = ("Length > 63 in rich group config '{label}'.") E1053 = ("Rich group config {label} specifies lengths that are not in ascending order.") + E1054 = ("Mismatched lengths in hash embed config: {len_rows} rows, {len_attrs} attrs.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 164990baf..29e3e0623 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -151,7 +151,7 @@ def MultiHashEmbed( Requires a vectors table to be loaded in the Doc objects' vocab. """ if len(rows) != len(attrs): - raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") + raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs))) seed = 7 def make_hash_embed(index): @@ -253,7 +253,7 @@ def RichMultiHashEmbed( """ if len(rows) != len(attrs): - raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") + raise ValueError(Errors.E1054.format(len_rows=len(rows), len_attrs=len(attrs))) _verify_rich_config_group("prefix", pref_lengths, pref_rows) _verify_rich_config_group("suffix", suff_lengths, suff_rows) @@ -296,7 +296,7 @@ def RichMultiHashEmbed( ), max_out, ragged2list(), - Dropout(0.0) + Dropout(0.0), ) else: model = chain( @@ -305,7 +305,7 @@ def RichMultiHashEmbed( with_array(concatenate(*embeddings)), max_out, ragged2list(), - Dropout(0.0) + Dropout(0.0), ) return model diff --git a/spacy/ml/richfeatureextractor.pxd b/spacy/ml/richfeatureextractor.pxd new file mode 100644 index 000000000..ff862a0b3 --- /dev/null +++ b/spacy/ml/richfeatureextractor.pxd @@ -0,0 +1,27 @@ +cimport numpy as np + +cdef void _set_prefix_lengths( + const unsigned char* tok_str, + const int tok_str_l, + const int p_max_l, + unsigned char* pref_l_buf, +) nogil + + +cdef void _set_suffix_lengths( + const unsigned char* tok_str, + const int tok_str_l, + const int s_max_l, + unsigned char* suff_l_buf, +) nogil + + +cdef int _write_hashes( + const unsigned char* res_buf, + const unsigned char* aff_l_buf, + const unsigned char* offset_buf, + const int res_buf_last, + np.uint64_t* hashes_ptr, +) nogil + + diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py deleted file mode 100644 index 53df964c4..000000000 --- a/spacy/ml/richfeatureextractor.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Optional, Callable, Tuple -from thinc.types import Ints2d -from thinc.api import Model, registry, get_current_ops -from ..tokens import Doc - - -@registry.layers("spacy.RichFeatureExtractor.v1") -def RichFeatureExtractor( - *, - case_sensitive: bool, - pref_lengths: Optional[List[int]] = None, - suff_lengths: Optional[List[int]] = None, -) -> Model[List[Doc], List[Ints2d]]: - return Model( - "extract_character_combination_hashes", - forward, - attrs={ - "case_sensitive": case_sensitive, - "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(), - "s_lengths": bytes(suff_lengths) if suff_lengths is 
not None else bytes(), - }, - ) - - -def forward( - model: Model[List[Doc], List[Ints2d]], docs, is_train: bool -) -> Tuple[List[Ints2d], Callable]: - ops = model.ops - case_sensitive: bool = model.attrs["case_sensitive"] - p_lengths: bytes = model.attrs["p_lengths"] - s_lengths: bytes = model.attrs["s_lengths"] - features: List[Ints2d] = [] - for doc in docs: - hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - p_lengths=p_lengths, - s_lengths=s_lengths, - ) - features.append(ops.asarray2i(hashes, dtype="uint64")) - - backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] - return features, backprop diff --git a/spacy/ml/richfeatureextractor.pyi b/spacy/ml/richfeatureextractor.pyi new file mode 100644 index 000000000..3aa357d01 --- /dev/null +++ b/spacy/ml/richfeatureextractor.pyi @@ -0,0 +1,9 @@ +from ..tokens import Doc + +def get_character_combination_hashes( + *, + doc: Doc, + case_sensitive: bool, + p_lengths: bytes, + s_lengths: bytes, +): \ No newline at end of file diff --git a/spacy/ml/richfeatureextractor.pyx b/spacy/ml/richfeatureextractor.pyx new file mode 100644 index 000000000..f331dc853 --- /dev/null +++ b/spacy/ml/richfeatureextractor.pyx @@ -0,0 +1,233 @@ +from typing import List, Optional, Callable, Tuple +import numpy +from thinc.types import Ints2d +from thinc.api import Model, registry, get_current_ops +from ..symbols import NAMES as SYMBOLS_BY_INT + +cimport numpy as np +from cymem.cymem cimport Pool +from libc.string cimport memset, strlen +from libc.stdint cimport uint64_t +from ..tokens.doc cimport Doc +from ..structs cimport TokenC +from ..typedefs cimport attr_t + + +@registry.layers("spacy.RichFeatureExtractor.v1") +def RichFeatureExtractor( + *, + case_sensitive: bool, + pref_lengths: Optional[List[int]] = None, + suff_lengths: Optional[List[int]] = None, +) -> Model[List[Doc], List[Ints2d]]: + # Because the calling code guarantees that the integers in the list are each less than 256, + # the integer list can be converted into *bytes*. + return Model( + "extract_character_combination_hashes", + forward, + attrs={ + "case_sensitive": case_sensitive, + "p_lengths": bytes(pref_lengths) if pref_lengths is not None else bytes(), + "s_lengths": bytes(suff_lengths) if suff_lengths is not None else bytes(), + }, + ) + + +def forward( + model: Model[List[Doc], List[Ints2d]], docs, is_train: bool +) -> Tuple[List[Ints2d], Callable]: + ops = model.ops + case_sensitive: bool = model.attrs["case_sensitive"] + p_lengths: bytes = model.attrs["p_lengths"] + s_lengths: bytes = model.attrs["s_lengths"] + features: List[Ints2d] = [] + for doc in docs: + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=case_sensitive, + p_lengths=p_lengths, + s_lengths=s_lengths, + ) + features.append(ops.asarray2i(hashes, dtype="uint64")) + + backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] + return features, backprop + + +def get_character_combination_hashes( + *, + Doc doc, + const bint case_sensitive, + const unsigned char* p_lengths, + const unsigned char* s_lengths, +): + """ + Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations + derived from the raw text of each token. + + doc: the document + case_sensitive: if *False*, hashes are generated based on the lower-case version of each token. + p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. 
+ For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". + s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. + For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa". + + Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of + the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible + for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte + character widths, that individual values within buffers can never exceed the capacity of a single byte (255). + + Note that this method performs no data validation itself as it expects the calling code will already have done so, and + that the behaviour of the code may be erratic if the supplied parameters do not conform to expectations. + """ + + # Work out lengths + cdef int p_lengths_l = strlen( p_lengths) + cdef int s_lengths_l = strlen( s_lengths) + cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0 + cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0 + + # Define / allocate buffers + cdef Pool mem = Pool() + cdef unsigned char* pref_l_buf = mem.alloc(p_max_l, sizeof(char)) + cdef unsigned char* suff_l_buf = mem.alloc(s_max_l, sizeof(char)) + cdef int doc_l = doc.length + cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty( + (doc_l, p_lengths_l + s_lengths_l), dtype="uint64") + cdef np.uint64_t* hashes_ptr = hashes.data + + # Define working variables + cdef TokenC tok_c + cdef int tok_i, tok_str_l + cdef attr_t num_tok_attr + cdef bytes tok_str_bytes + cdef const unsigned char* tok_str + + for tok_i in range(doc_l): + tok_c = doc.c[tok_i] + num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower + if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens + if num_tok_attr == 0: + tok_str_bytes = b"" + else: + tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8") + tok_str = tok_str_bytes + tok_str_l = len(tok_str_bytes) + else: + tok_str, tok_str_l = doc.vocab.strings.utf8_ptr(num_tok_attr) + + if p_max_l > 0: + _set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf) + hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr) + + if s_max_l > 0: + _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf) + hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr) + + return hashes + +cdef void _set_prefix_lengths( + const unsigned char* tok_str, + const int tok_str_l, + const int p_max_l, + unsigned char* pref_l_buf, +) nogil: + """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l* + characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length + of the whole word. + + tok_str: a UTF-8 representation of a string. + tok_str_l: the length of *tok_str*. + p_max_l: the number of characters to process at the beginning of the word. + pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. 
The code calling *get_character_combination_hashes()* is + responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values + within the buffer can never exceed the capacity of a single byte (255). + """ + cdef int tok_str_idx = 1, pref_l_buf_idx = 0 + + while pref_l_buf_idx < p_max_l: + if (tok_str_idx >= tok_str_l + or + ((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character + ): + pref_l_buf[pref_l_buf_idx] = tok_str_idx + pref_l_buf_idx += 1 + if tok_str_idx >= tok_str_l: + break + tok_str_idx += 1 + + if pref_l_buf_idx < p_max_l: + memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx) + + +cdef void _set_suffix_lengths( + const unsigned char* tok_str, + const int tok_str_l, + const int s_max_l, + unsigned char* suff_l_buf, +) nogil: + """ Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l* + characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length + of the whole word. + + tok_str: a UTF-8 representation of a string. + tok_str_l: the length of *tok_str*. + s_max_l: the number of characters to process at the end of the word. + suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is + responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values + within the buffer can never exceed the capacity of a single byte (255). + """ + cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0 + + while suff_l_buf_idx < s_max_l: + if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character + suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx + suff_l_buf_idx += 1 + tok_str_idx -= 1 + if tok_str_idx < 0: + break + + if suff_l_buf_idx < s_max_l: + memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx) + + +cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325 +cdef uint64_t FNV1A_PRIME = 0x00000100000001B3 + + +cdef int _write_hashes( + const unsigned char* res_buf, + const unsigned char* aff_l_buf, + const unsigned char* offset_buf, + const int res_buf_last, + np.uint64_t* hashes_ptr, +) nogil: + """ Write 64-bit FNV1A hashes for a token/rich property group combination. + + res_buf: the string from which to generate the hash values. + aff_l_buf: one-byte lengths describing how many characters to hash. + offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*. + res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in + *res_buf*; if affixes should start at the beginning of *res_buf*, *0*. + hashes_ptr: a pointer starting from which the new hashes should be written. + + Returns: the number of hashes written. 
+ """ + + cdef int last_offset = 0, hash_idx = 0, offset, aff_l + cdef uint64_t hash_val = FNV1A_OFFSET_BASIS + + while True: + aff_l = aff_l_buf[hash_idx] + if aff_l == 0: + return hash_idx + offset = offset_buf[aff_l - 1] + while last_offset < offset: + if res_buf_last > 0: + hash_val ^= res_buf[res_buf_last - last_offset] + else: + hash_val ^= res_buf[last_offset] + hash_val *= FNV1A_PRIME + last_offset += 1 + hashes_ptr[hash_idx] = hash_val + hash_idx += 1 diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 610570f53..38003dea9 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,8 +1,6 @@ -from pickle import EMPTY_DICT import weakref import numpy -from time import time from numpy.testing import assert_array_equal import pytest import warnings @@ -992,635 +990,3 @@ def test_doc_spans_setdefault(en_tokenizer): assert len(doc.spans["key2"]) == 1 doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]])) assert len(doc.spans["key3"]) == 2 - - -EMPTY_HASH_VALUE = 0xCBF29CE484222325 -FNV1A_OFFSET_BASIS = 0xCBF29CE484222325 -FNV1A_PRIME = 0x00000100000001B3 - - -def _get_fnv1a_hash(input: bytes) -> int: - hash_val = FNV1A_OFFSET_BASIS - length = len(input) - offset = 0 - - while offset < length: - hash_val ^= input[offset] - hash_val *= FNV1A_PRIME - hash_val %= 2**64 - offset += 1 - return hash_val - - -def test_fnv1a_hash(): - """Checks the conformity of the 64-bit FNV1A implementation with - http://www.isthe.com/chongo/src/fnv/test_fnv.c. - The method called here, _get_fnv1a_hash(), is only used in testing; - in production code, the hashing is performed in a fashion that is interweaved - with other logic. The conformity of the production code is demonstrated by the - character combination hash tests, where hashes produced by the production code - are tested for equality against hashes produced by _get_fnv1a_hash(). 
- """ - INPUTS = [ - b"", - b"a", - b"b", - b"c", - b"d", - b"e", - b"f", - b"fo", - b"foo", - b"foob", - b"fooba", - b"foobar", - b"\x00", - b"a\x00", - b"b\x00", - b"c\x00", - b"d\x00", - b"e\x00", - b"f\x00", - b"fo\x00", - b"foo\x00", - b"foob\x00", - b"fooba\x00", - b"foobar\x00", - b"ch", - b"cho", - b"chon", - b"chong", - b"chongo", - b"chongo ", - b"chongo w", - b"chongo wa", - b"chongo was", - b"chongo was ", - b"chongo was h", - b"chongo was he", - b"chongo was her", - b"chongo was here", - b"chongo was here!", - b"chongo was here!\n", - b"ch\x00", - b"cho\x00", - b"chon\x00", - b"chong\x00", - b"chongo\x00", - b"chongo \x00", - b"chongo w\x00", - b"chongo wa\x00", - b"chongo was\x00", - b"chongo was \x00", - b"chongo was h\x00", - b"chongo was he\x00", - b"chongo was her\x00", - b"chongo was here\x00", - b"chongo was here!\x00", - b"chongo was here!\n\x00", - b"cu", - b"cur", - b"curd", - b"curds", - b"curds ", - b"curds a", - b"curds an", - b"curds and", - b"curds and ", - b"curds and w", - b"curds and wh", - b"curds and whe", - b"curds and whey", - b"curds and whey\n", - b"cu\x00", - b"cur\x00", - b"curd\x00", - b"curds\x00", - b"curds \x00", - b"curds a\x00", - b"curds an\x00", - b"curds and\x00", - b"curds and \x00", - b"curds and w\x00", - b"curds and wh\x00", - b"curds and whe\x00", - b"curds and whey\x00", - b"curds and whey\n\x00", - b"hi", - b"hi\x00", - b"hello", - b"hello\x00", - b"\xff\x00\x00\x01", - b"\x01\x00\x00\xff", - b"\xff\x00\x00\x02", - b"\x02\x00\x00\xff", - b"\xff\x00\x00\x03", - b"\x03\x00\x00\xff", - b"\xff\x00\x00\x04", - b"\x04\x00\x00\xff", - b"\x40\x51\x4e\x44", - b"\x44\x4e\x51\x40", - b"\x40\x51\x4e\x4a", - b"\x4a\x4e\x51\x40", - b"\x40\x51\x4e\x54", - b"\x54\x4e\x51\x40", - b"127.0.0.1", - b"127.0.0.1\x00", - b"127.0.0.2", - b"127.0.0.2\x00", - b"127.0.0.3", - b"127.0.0.3\x00", - b"64.81.78.68", - b"64.81.78.68\x00", - b"64.81.78.74", - b"64.81.78.74\x00", - b"64.81.78.84", - b"64.81.78.84\x00", - b"feedface", - b"feedface\x00", - b"feedfacedaffdeed", - b"feedfacedaffdeed\x00", - b"feedfacedeadbeef", - b"feedfacedeadbeef\x00", - b"line 1\nline 2\nline 3", - b"chongo /\\../\\", - b"chongo /\\../\\\x00", - b"chongo (Landon Curt Noll) /\\../\\", - b"chongo (Landon Curt Noll) /\\../\\\x00", - b"http://antwrp.gsfc.nasa.gov/apod/astropix.html", - b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash", - b"http://epod.usra.edu/", - b"http://exoplanet.eu/", - b"http://hvo.wr.usgs.gov/cam3/", - b"http://hvo.wr.usgs.gov/cams/HMcam/", - b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html", - b"http://hvo.wr.usgs.gov/kilauea/update/images.html", - b"http://hvo.wr.usgs.gov/kilauea/update/maps.html", - b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html", - b"http://neo.jpl.nasa.gov/risk/", - b"http://norvig.com/21-days.html", - b"http://primes.utm.edu/curios/home.php", - b"http://slashdot.org/", - b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html", - b"http://volcano.wr.usgs.gov/kilaueastatus.php", - b"http://www.avo.alaska.edu/activity/Redoubt.php", - b"http://www.dilbert.com/fast/", - b"http://www.fourmilab.ch/gravitation/orbits/", - b"http://www.fpoa.net/", - b"http://www.ioccc.org/index.html", - b"http://www.isthe.com/cgi-bin/number.cgi", - b"http://www.isthe.com/chongo/bio.html", - b"http://www.isthe.com/chongo/index.html", - b"http://www.isthe.com/chongo/src/calc/lucas-calc", - b"http://www.isthe.com/chongo/tech/astro/venus2004.html", - b"http://www.isthe.com/chongo/tech/astro/vita.html", - b"http://www.isthe.com/chongo/tech/comp/c/expert.html", - 
b"http://www.isthe.com/chongo/tech/comp/calc/index.html", - b"http://www.isthe.com/chongo/tech/comp/fnv/index.html", - b"http://www.isthe.com/chongo/tech/math/number/howhigh.html", - b"http://www.isthe.com/chongo/tech/math/number/number.html", - b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html", - b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest", - b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi", - b"http://www.lavarnd.org/cgi-bin/haiku.cgi", - b"http://www.lavarnd.org/cgi-bin/rand-none.cgi", - b"http://www.lavarnd.org/cgi-bin/randdist.cgi", - b"http://www.lavarnd.org/index.html", - b"http://www.lavarnd.org/what/nist-test.html", - b"http://www.macosxhints.com/", - b"http://www.mellis.com/", - b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm", - b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm", - b"http://www.paulnoll.com/", - b"http://www.pepysdiary.com/", - b"http://www.sciencenews.org/index/home/activity/view", - b"http://www.skyandtelescope.com/", - b"http://www.sput.nl/~rob/sirius.html", - b"http://www.systemexperts.com/", - b"http://www.tq-international.com/phpBB3/index.php", - b"http://www.travelquesttours.com/index.htm", - b"http://www.wunderground.com/global/stations/89606.html", - b"21701" * 10, - b"M21701" * 10, - b"2^21701-1" * 10, - b"\x54\xc5" * 10, - b"\xc5\x54" * 10, - b"23209" * 10, - b"M23209" * 10, - b"2^23209-1" * 10, - b"\x5a\xa9" * 10, - b"\xa9\x5a" * 10, - b"391581216093" * 10, - b"391581*2^216093-1" * 10, - b"\x05\xf9\x9d\x03\x4c\x81" * 10, - b"FEDCBA9876543210" * 10, - b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10, - b"EFCDAB8967452301" * 10, - b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10, - b"0123456789ABCDEF" * 10, - b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10, - b"1032547698BADCFE" * 10, - b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10, - b"\x00" * 500, - b"\x07" * 500, - b"~" * 500, - b"\x7f" * 500, - ] - - OUTPUTS = [ - EMPTY_HASH_VALUE, - 0xAF63DC4C8601EC8C, - 0xAF63DF4C8601F1A5, - 0xAF63DE4C8601EFF2, - 0xAF63D94C8601E773, - 0xAF63D84C8601E5C0, - 0xAF63DB4C8601EAD9, - 0x08985907B541D342, - 0xDCB27518FED9D577, - 0xDD120E790C2512AF, - 0xCAC165AFA2FEF40A, - 0x85944171F73967E8, - 0xAF63BD4C8601B7DF, - 0x089BE207B544F1E4, - 0x08A61407B54D9B5F, - 0x08A2AE07B54AB836, - 0x0891B007B53C4869, - 0x088E4A07B5396540, - 0x08987C07B5420EBB, - 0xDCB28A18FED9F926, - 0xDD1270790C25B935, - 0xCAC146AFA2FEBF5D, - 0x8593D371F738ACFE, - 0x34531CA7168B8F38, - 0x08A25607B54A22AE, - 0xF5FAF0190CF90DF3, - 0xF27397910B3221C7, - 0x2C8C2B76062F22E0, - 0xE150688C8217B8FD, - 0xF35A83C10E4F1F87, - 0xD1EDD10B507344D0, - 0x2A5EE739B3DDB8C3, - 0xDCFB970CA1C0D310, - 0x4054DA76DAA6DA90, - 0xF70A2FF589861368, - 0x4C628B38AED25F17, - 0x9DD1F6510F78189F, - 0xA3DE85BD491270CE, - 0x858E2FA32A55E61D, - 0x46810940EFF5F915, - 0xF5FADD190CF8EDAA, - 0xF273ED910B32B3E9, - 0x2C8C5276062F6525, - 0xE150B98C821842A0, - 0xF35AA3C10E4F55E7, - 0xD1ED680B50729265, - 0x2A5F0639B3DDED70, - 0xDCFBAA0CA1C0F359, - 0x4054BA76DAA6A430, - 0xF709C7F5898562B0, - 0x4C62E638AED2F9B8, - 0x9DD1A8510F779415, - 0xA3DE2ABD4911D62D, - 0x858E0EA32A55AE0A, - 0x46810F40EFF60347, - 0xC33BCE57BEF63EAF, - 0x08A24307B54A0265, - 0xF5B9FD190CC18D15, - 0x4C968290ACE35703, - 0x07174BD5C64D9350, - 0x5A294C3FF5D18750, - 0x05B3C1AEB308B843, - 0xB92A48DA37D0F477, - 0x73CDDDCCD80EBC49, - 0xD58C4C13210A266B, - 0xE78B6081243EC194, - 0xB096F77096A39F34, - 0xB425C54FF807B6A3, - 0x23E520E2751BB46E, - 0x1A0B44CCFE1385EC, - 0xF5BA4B190CC2119F, - 0x4C962690ACE2BAAF, - 0x0716DED5C64CDA19, - 
0x5A292C3FF5D150F0, - 0x05B3E0AEB308ECF0, - 0xB92A5EDA37D119D9, - 0x73CE41CCD80F6635, - 0xD58C2C132109F00B, - 0xE78BAF81243F47D1, - 0xB0968F7096A2EE7C, - 0xB425A84FF807855C, - 0x23E4E9E2751B56F9, - 0x1A0B4ECCFE1396EA, - 0x54ABD453BB2C9004, - 0x08BA5F07B55EC3DA, - 0x337354193006CB6E, - 0xA430D84680AABD0B, - 0xA9BC8ACCA21F39B1, - 0x6961196491CC682D, - 0xAD2BB1774799DFE9, - 0x6961166491CC6314, - 0x8D1BB3904A3B1236, - 0x6961176491CC64C7, - 0xED205D87F40434C7, - 0x6961146491CC5FAE, - 0xCD3BAF5E44F8AD9C, - 0xE3B36596127CD6D8, - 0xF77F1072C8E8A646, - 0xE3B36396127CD372, - 0x6067DCE9932AD458, - 0xE3B37596127CF208, - 0x4B7B10FA9FE83936, - 0xAABAFE7104D914BE, - 0xF4D3180B3CDE3EDA, - 0xAABAFD7104D9130B, - 0xF4CFB20B3CDB5BB1, - 0xAABAFC7104D91158, - 0xF4CC4C0B3CD87888, - 0xE729BAC5D2A8D3A7, - 0x74BC0524F4DFA4C5, - 0xE72630C5D2A5B352, - 0x6B983224EF8FB456, - 0xE73042C5D2AE266D, - 0x8527E324FDEB4B37, - 0x0A83C86FEE952ABC, - 0x7318523267779D74, - 0x3E66D3D56B8CACA1, - 0x956694A5C0095593, - 0xCAC54572BB1A6FC8, - 0xA7A4C9F3EDEBF0D8, - 0x7829851FAC17B143, - 0x2C8F4C9AF81BCF06, - 0xD34E31539740C732, - 0x3605A2AC253D2DB1, - 0x08C11B8346F4A3C3, - 0x6BE396289CE8A6DA, - 0xD9B957FB7FE794C5, - 0x05BE33DA04560A93, - 0x0957F1577BA9747C, - 0xDA2CC3ACC24FBA57, - 0x74136F185B29E7F0, - 0xB2F2B4590EDB93B2, - 0xB3608FCE8B86AE04, - 0x4A3A865079359063, - 0x5B3A7EF496880A50, - 0x48FAE3163854C23B, - 0x07AAA640476E0B9A, - 0x2F653656383A687D, - 0xA1031F8E7599D79C, - 0xA31908178FF92477, - 0x097EDF3C14C3FB83, - 0xB51CA83FEAA0971B, - 0xDD3C0D96D784F2E9, - 0x86CD26A9EA767D78, - 0xE6B215FF54A30C18, - 0xEC5B06A1C5531093, - 0x45665A929F9EC5E5, - 0x8C7609B4A9F10907, - 0x89AAC3A491F0D729, - 0x32CE6B26E0F4A403, - 0x614AB44E02B53E01, - 0xFA6472EB6EEF3290, - 0x9E5D75EB1948EB6A, - 0xB6D12AD4A8671852, - 0x88826F56EBA07AF1, - 0x44535BF2645BC0FD, - 0x169388FFC21E3728, - 0xF68AAC9E396D8224, - 0x8E87D7E7472B3883, - 0x295C26CAA8B423DE, - 0x322C814292E72176, - 0x8A06550EB8AF7268, - 0xEF86D60E661BCF71, - 0x9E5426C87F30EE54, - 0xF1EA8AA826FD047E, - 0x0BABAF9A642CB769, - 0x4B3341D4068D012E, - 0xD15605CBC30A335C, - 0x5B21060AED8412E5, - 0x45E2CDA1CE6F4227, - 0x50AE3745033AD7D4, - 0xAA4588CED46BF414, - 0xC1B0056C4A95467E, - 0x56576A71DE8B4089, - 0xBF20965FA6DC927E, - 0x569F8383C2040882, - 0xE1E772FBA08FECA0, - 0x4CED94AF97138AC4, - 0xC4112FFB337A82FB, - 0xD64A4FD41DE38B7D, - 0x4CFC32329EDEBCBB, - 0x0803564445050395, - 0xAA1574ECF4642FFD, - 0x694BC4E54CC315F9, - 0xA3D7CB273B011721, - 0x577C2F8B6115BFA5, - 0xB7EC8C1A769FB4C1, - 0x5D5CFCE63359AB19, - 0x33B96C3CD65B5F71, - 0xD845097780602BB9, - 0x84D47645D02DA3D5, - 0x83544F33B58773A5, - 0x9175CBB2160836C5, - 0xC71B3BC175E72BC5, - 0x636806AC222EC985, - 0xB6EF0E6950F52ED5, - 0xEAD3D8A0F3DFDAA5, - 0x922908FE9A861BA5, - 0x6D4821DE275FD5C5, - 0x1FE3FCE62BD816B5, - 0xC23E9FCCD6F70591, - 0xC1AF12BDFE16B5B5, - 0x39E9F18F2F85E221, - ] - - assert len(INPUTS) == len(OUTPUTS) - for i in range(len(INPUTS)): - assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i] - - -def _encode_and_hash(input: str, *, reverse: bool = False) -> int: - encoded_input = input.encode("UTF-8") - if reverse: - encoded_input = encoded_input[::-1] - return _get_fnv1a_hash(encoded_input) - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): - doc = en_tokenizer("spaCy✨ and Prodigy") - - hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - p_lengths=bytes( - ( - 1, - 3, - 4, - ) - ), - s_lengths=bytes( - ( - 2, - 3, - 4, - 5, - ) 
- ), - ) - assert hashes[0][0] == _encode_and_hash("s") - assert hashes[0][1] == _encode_and_hash("spa") - assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac") - assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc") - assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca") - assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap") - assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps") - assert hashes[1][0] == _encode_and_hash("✨") - assert hashes[1][1] == _encode_and_hash("✨") - assert hashes[1][2] == _encode_and_hash("✨") - assert hashes[1][3] == _encode_and_hash("✨", reverse=True) - assert hashes[1][4] == _encode_and_hash("✨", reverse=True) - assert hashes[1][5] == _encode_and_hash("✨", reverse=True) - assert hashes[1][6] == _encode_and_hash("✨", reverse=True) - assert hashes[2][0] == _encode_and_hash("a") - assert hashes[2][1] == _encode_and_hash("and") - assert hashes[2][2] == _encode_and_hash("and") - assert hashes[2][3] == _encode_and_hash("dn") - assert hashes[2][4] == _encode_and_hash("dna") - assert hashes[2][5] == _encode_and_hash("dna") - assert hashes[2][6] == _encode_and_hash("dna") - assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p") - assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro") - assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod") - assert hashes[3][3] == _encode_and_hash("yg") - assert hashes[3][4] == _encode_and_hash("ygi") - assert hashes[3][5] == _encode_and_hash("ygid") - assert hashes[3][6] == _encode_and_hash("ygido") - - -def test_get_character_combination_hashes_good_case_partial(en_tokenizer): - doc = en_tokenizer("spaCy✨ and Prodigy") - hashes = doc.get_character_combination_hashes( - case_sensitive=False, - p_lengths=bytes(), - s_lengths=bytes( - ( - 2, - 3, - 4, - 5, - ) - ), - ) - - assert hashes[0][0] == _encode_and_hash("yc") - assert hashes[0][1] == _encode_and_hash("yca") - assert hashes[0][2] == _encode_and_hash("ycap") - assert hashes[0][3] == _encode_and_hash("ycaps") - assert hashes[1][0] == _encode_and_hash("✨", reverse=True) - assert hashes[1][1] == _encode_and_hash("✨", reverse=True) - assert hashes[1][2] == _encode_and_hash("✨", reverse=True) - assert hashes[1][3] == _encode_and_hash("✨", reverse=True) - assert hashes[2][0] == _encode_and_hash("dn") - assert hashes[2][1] == _encode_and_hash("dna") - assert hashes[2][2] == _encode_and_hash("dna") - assert hashes[2][3] == _encode_and_hash("dna") - assert hashes[3][0] == _encode_and_hash("yg") - assert hashes[3][1] == _encode_and_hash("ygi") - assert hashes[3][2] == _encode_and_hash("ygid") - assert hashes[3][3] == _encode_and_hash("ygido") - - -def test_get_character_combination_hashes_various_lengths(en_tokenizer): - doc = en_tokenizer("sp𐌞Cé") - - for p_length in range(1, 8): - for s_length in range(1, 8): - - hashes = doc.get_character_combination_hashes( - case_sensitive=False, - p_lengths=bytes((p_length,)), - s_lengths=bytes((s_length,)), - ) - - assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length]) - assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True) - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_turkish_i_with_dot( - en_tokenizer, case_sensitive -): - doc = en_tokenizer("İ".lower() + "İ") - hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - p_lengths=bytes( - ( - 1, - 2, - 3, - 4, - ) - 
), - s_lengths=bytes( - ( - 1, - 2, - 3, - 4, - ) - ), - ) - - COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") - assert hashes[0][0] == _encode_and_hash("i") - assert hashes[0][1] == _encode_and_hash("İ".lower()) - if case_sensitive: - assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ") - assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ") - assert hashes[0][4] == _encode_and_hash("İ", reverse=True) - assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True) - assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True) - assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True) - - else: - assert hashes[0][2] == _encode_and_hash("İ".lower() + "i") - assert hashes[0][3] == _encode_and_hash("İ".lower() * 2) - assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True) - assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True) - assert hashes[0][6] == _encode_and_hash( - COMBINING_DOT_ABOVE + "İ".lower(), reverse=True - ) - assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True) - - -@pytest.mark.parametrize("case_sensitive", [True, False]) -def test_get_character_combination_hashes_string_store_spec_cases( - en_tokenizer, case_sensitive -): - symbol = "FLAG19" - short_word = "bee" - normal_word = "serendipity" - long_word = "serendipity" * 50 - assert len(long_word) > 255 - doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word))) - assert len(doc) == 4 - hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - p_lengths=bytes((2,)), - s_lengths=bytes((2,)), - ) - assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl") - assert hashes[0][1] == _encode_and_hash("91") - assert hashes[1][0] == _encode_and_hash("be") - assert hashes[1][1] == _encode_and_hash("ee") - assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se") - assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt") - - -def test_character_combination_hashes_empty_lengths(en_tokenizer): - doc = en_tokenizer("and𐌞") - assert doc.get_character_combination_hashes( - case_sensitive=True, - p_lengths=bytes(), - s_lengths=bytes(), - ).shape == (1, 0) diff --git a/spacy/tests/doc/test_richfeatureextractor.py b/spacy/tests/doc/test_richfeatureextractor.py new file mode 100644 index 000000000..3e4c8c869 --- /dev/null +++ b/spacy/tests/doc/test_richfeatureextractor.py @@ -0,0 +1,639 @@ +import pytest +from ...ml.richfeatureextractor import get_character_combination_hashes + +EMPTY_HASH_VALUE = 0xCBF29CE484222325 +FNV1A_OFFSET_BASIS = 0xCBF29CE484222325 +FNV1A_PRIME = 0x00000100000001B3 + + +def _get_fnv1a_hash(input: bytes) -> int: + hash_val = FNV1A_OFFSET_BASIS + length = len(input) + offset = 0 + + while offset < length: + hash_val ^= input[offset] + hash_val *= FNV1A_PRIME + hash_val %= 2**64 + offset += 1 + return hash_val + + +def test_fnv1a_hash(): + """Checks the conformity of the 64-bit FNV1A implementation with + http://www.isthe.com/chongo/src/fnv/test_fnv.c. + The method called here, _get_fnv1a_hash(), is only used in testing; + in production code, the hashing is performed in a fashion that is interweaved + with other logic. The conformity of the production code is demonstrated by the + character combination hash tests, where hashes produced by the production code + are tested for equality against hashes produced by _get_fnv1a_hash(). 
+ """ + INPUTS = [ + b"", + b"a", + b"b", + b"c", + b"d", + b"e", + b"f", + b"fo", + b"foo", + b"foob", + b"fooba", + b"foobar", + b"\x00", + b"a\x00", + b"b\x00", + b"c\x00", + b"d\x00", + b"e\x00", + b"f\x00", + b"fo\x00", + b"foo\x00", + b"foob\x00", + b"fooba\x00", + b"foobar\x00", + b"ch", + b"cho", + b"chon", + b"chong", + b"chongo", + b"chongo ", + b"chongo w", + b"chongo wa", + b"chongo was", + b"chongo was ", + b"chongo was h", + b"chongo was he", + b"chongo was her", + b"chongo was here", + b"chongo was here!", + b"chongo was here!\n", + b"ch\x00", + b"cho\x00", + b"chon\x00", + b"chong\x00", + b"chongo\x00", + b"chongo \x00", + b"chongo w\x00", + b"chongo wa\x00", + b"chongo was\x00", + b"chongo was \x00", + b"chongo was h\x00", + b"chongo was he\x00", + b"chongo was her\x00", + b"chongo was here\x00", + b"chongo was here!\x00", + b"chongo was here!\n\x00", + b"cu", + b"cur", + b"curd", + b"curds", + b"curds ", + b"curds a", + b"curds an", + b"curds and", + b"curds and ", + b"curds and w", + b"curds and wh", + b"curds and whe", + b"curds and whey", + b"curds and whey\n", + b"cu\x00", + b"cur\x00", + b"curd\x00", + b"curds\x00", + b"curds \x00", + b"curds a\x00", + b"curds an\x00", + b"curds and\x00", + b"curds and \x00", + b"curds and w\x00", + b"curds and wh\x00", + b"curds and whe\x00", + b"curds and whey\x00", + b"curds and whey\n\x00", + b"hi", + b"hi\x00", + b"hello", + b"hello\x00", + b"\xff\x00\x00\x01", + b"\x01\x00\x00\xff", + b"\xff\x00\x00\x02", + b"\x02\x00\x00\xff", + b"\xff\x00\x00\x03", + b"\x03\x00\x00\xff", + b"\xff\x00\x00\x04", + b"\x04\x00\x00\xff", + b"\x40\x51\x4e\x44", + b"\x44\x4e\x51\x40", + b"\x40\x51\x4e\x4a", + b"\x4a\x4e\x51\x40", + b"\x40\x51\x4e\x54", + b"\x54\x4e\x51\x40", + b"127.0.0.1", + b"127.0.0.1\x00", + b"127.0.0.2", + b"127.0.0.2\x00", + b"127.0.0.3", + b"127.0.0.3\x00", + b"64.81.78.68", + b"64.81.78.68\x00", + b"64.81.78.74", + b"64.81.78.74\x00", + b"64.81.78.84", + b"64.81.78.84\x00", + b"feedface", + b"feedface\x00", + b"feedfacedaffdeed", + b"feedfacedaffdeed\x00", + b"feedfacedeadbeef", + b"feedfacedeadbeef\x00", + b"line 1\nline 2\nline 3", + b"chongo /\\../\\", + b"chongo /\\../\\\x00", + b"chongo (Landon Curt Noll) /\\../\\", + b"chongo (Landon Curt Noll) /\\../\\\x00", + b"http://antwrp.gsfc.nasa.gov/apod/astropix.html", + b"http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash", + b"http://epod.usra.edu/", + b"http://exoplanet.eu/", + b"http://hvo.wr.usgs.gov/cam3/", + b"http://hvo.wr.usgs.gov/cams/HMcam/", + b"http://hvo.wr.usgs.gov/kilauea/update/deformation.html", + b"http://hvo.wr.usgs.gov/kilauea/update/images.html", + b"http://hvo.wr.usgs.gov/kilauea/update/maps.html", + b"http://hvo.wr.usgs.gov/volcanowatch/current_issue.html", + b"http://neo.jpl.nasa.gov/risk/", + b"http://norvig.com/21-days.html", + b"http://primes.utm.edu/curios/home.php", + b"http://slashdot.org/", + b"http://tux.wr.usgs.gov/Maps/155.25-19.5.html", + b"http://volcano.wr.usgs.gov/kilaueastatus.php", + b"http://www.avo.alaska.edu/activity/Redoubt.php", + b"http://www.dilbert.com/fast/", + b"http://www.fourmilab.ch/gravitation/orbits/", + b"http://www.fpoa.net/", + b"http://www.ioccc.org/index.html", + b"http://www.isthe.com/cgi-bin/number.cgi", + b"http://www.isthe.com/chongo/bio.html", + b"http://www.isthe.com/chongo/index.html", + b"http://www.isthe.com/chongo/src/calc/lucas-calc", + b"http://www.isthe.com/chongo/tech/astro/venus2004.html", + b"http://www.isthe.com/chongo/tech/astro/vita.html", + b"http://www.isthe.com/chongo/tech/comp/c/expert.html", + 
b"http://www.isthe.com/chongo/tech/comp/calc/index.html", + b"http://www.isthe.com/chongo/tech/comp/fnv/index.html", + b"http://www.isthe.com/chongo/tech/math/number/howhigh.html", + b"http://www.isthe.com/chongo/tech/math/number/number.html", + b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html", + b"http://www.isthe.com/chongo/tech/math/prime/mersenne.html#largest", + b"http://www.lavarnd.org/cgi-bin/corpspeak.cgi", + b"http://www.lavarnd.org/cgi-bin/haiku.cgi", + b"http://www.lavarnd.org/cgi-bin/rand-none.cgi", + b"http://www.lavarnd.org/cgi-bin/randdist.cgi", + b"http://www.lavarnd.org/index.html", + b"http://www.lavarnd.org/what/nist-test.html", + b"http://www.macosxhints.com/", + b"http://www.mellis.com/", + b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/havoalert.cfm", + b"http://www.nature.nps.gov/air/webcams/parks/havoso2alert/timelines_24.cfm", + b"http://www.paulnoll.com/", + b"http://www.pepysdiary.com/", + b"http://www.sciencenews.org/index/home/activity/view", + b"http://www.skyandtelescope.com/", + b"http://www.sput.nl/~rob/sirius.html", + b"http://www.systemexperts.com/", + b"http://www.tq-international.com/phpBB3/index.php", + b"http://www.travelquesttours.com/index.htm", + b"http://www.wunderground.com/global/stations/89606.html", + b"21701" * 10, + b"M21701" * 10, + b"2^21701-1" * 10, + b"\x54\xc5" * 10, + b"\xc5\x54" * 10, + b"23209" * 10, + b"M23209" * 10, + b"2^23209-1" * 10, + b"\x5a\xa9" * 10, + b"\xa9\x5a" * 10, + b"391581216093" * 10, + b"391581*2^216093-1" * 10, + b"\x05\xf9\x9d\x03\x4c\x81" * 10, + b"FEDCBA9876543210" * 10, + b"\xfe\xdc\xba\x98\x76\x54\x32\x10" * 10, + b"EFCDAB8967452301" * 10, + b"\xef\xcd\xab\x89\x67\x45\x23\x01" * 10, + b"0123456789ABCDEF" * 10, + b"\x01\x23\x45\x67\x89\xab\xcd\xef" * 10, + b"1032547698BADCFE" * 10, + b"\x10\x32\x54\x76\x98\xba\xdc\xfe" * 10, + b"\x00" * 500, + b"\x07" * 500, + b"~" * 500, + b"\x7f" * 500, + ] + + OUTPUTS = [ + EMPTY_HASH_VALUE, + 0xAF63DC4C8601EC8C, + 0xAF63DF4C8601F1A5, + 0xAF63DE4C8601EFF2, + 0xAF63D94C8601E773, + 0xAF63D84C8601E5C0, + 0xAF63DB4C8601EAD9, + 0x08985907B541D342, + 0xDCB27518FED9D577, + 0xDD120E790C2512AF, + 0xCAC165AFA2FEF40A, + 0x85944171F73967E8, + 0xAF63BD4C8601B7DF, + 0x089BE207B544F1E4, + 0x08A61407B54D9B5F, + 0x08A2AE07B54AB836, + 0x0891B007B53C4869, + 0x088E4A07B5396540, + 0x08987C07B5420EBB, + 0xDCB28A18FED9F926, + 0xDD1270790C25B935, + 0xCAC146AFA2FEBF5D, + 0x8593D371F738ACFE, + 0x34531CA7168B8F38, + 0x08A25607B54A22AE, + 0xF5FAF0190CF90DF3, + 0xF27397910B3221C7, + 0x2C8C2B76062F22E0, + 0xE150688C8217B8FD, + 0xF35A83C10E4F1F87, + 0xD1EDD10B507344D0, + 0x2A5EE739B3DDB8C3, + 0xDCFB970CA1C0D310, + 0x4054DA76DAA6DA90, + 0xF70A2FF589861368, + 0x4C628B38AED25F17, + 0x9DD1F6510F78189F, + 0xA3DE85BD491270CE, + 0x858E2FA32A55E61D, + 0x46810940EFF5F915, + 0xF5FADD190CF8EDAA, + 0xF273ED910B32B3E9, + 0x2C8C5276062F6525, + 0xE150B98C821842A0, + 0xF35AA3C10E4F55E7, + 0xD1ED680B50729265, + 0x2A5F0639B3DDED70, + 0xDCFBAA0CA1C0F359, + 0x4054BA76DAA6A430, + 0xF709C7F5898562B0, + 0x4C62E638AED2F9B8, + 0x9DD1A8510F779415, + 0xA3DE2ABD4911D62D, + 0x858E0EA32A55AE0A, + 0x46810F40EFF60347, + 0xC33BCE57BEF63EAF, + 0x08A24307B54A0265, + 0xF5B9FD190CC18D15, + 0x4C968290ACE35703, + 0x07174BD5C64D9350, + 0x5A294C3FF5D18750, + 0x05B3C1AEB308B843, + 0xB92A48DA37D0F477, + 0x73CDDDCCD80EBC49, + 0xD58C4C13210A266B, + 0xE78B6081243EC194, + 0xB096F77096A39F34, + 0xB425C54FF807B6A3, + 0x23E520E2751BB46E, + 0x1A0B44CCFE1385EC, + 0xF5BA4B190CC2119F, + 0x4C962690ACE2BAAF, + 0x0716DED5C64CDA19, + 
0x5A292C3FF5D150F0, + 0x05B3E0AEB308ECF0, + 0xB92A5EDA37D119D9, + 0x73CE41CCD80F6635, + 0xD58C2C132109F00B, + 0xE78BAF81243F47D1, + 0xB0968F7096A2EE7C, + 0xB425A84FF807855C, + 0x23E4E9E2751B56F9, + 0x1A0B4ECCFE1396EA, + 0x54ABD453BB2C9004, + 0x08BA5F07B55EC3DA, + 0x337354193006CB6E, + 0xA430D84680AABD0B, + 0xA9BC8ACCA21F39B1, + 0x6961196491CC682D, + 0xAD2BB1774799DFE9, + 0x6961166491CC6314, + 0x8D1BB3904A3B1236, + 0x6961176491CC64C7, + 0xED205D87F40434C7, + 0x6961146491CC5FAE, + 0xCD3BAF5E44F8AD9C, + 0xE3B36596127CD6D8, + 0xF77F1072C8E8A646, + 0xE3B36396127CD372, + 0x6067DCE9932AD458, + 0xE3B37596127CF208, + 0x4B7B10FA9FE83936, + 0xAABAFE7104D914BE, + 0xF4D3180B3CDE3EDA, + 0xAABAFD7104D9130B, + 0xF4CFB20B3CDB5BB1, + 0xAABAFC7104D91158, + 0xF4CC4C0B3CD87888, + 0xE729BAC5D2A8D3A7, + 0x74BC0524F4DFA4C5, + 0xE72630C5D2A5B352, + 0x6B983224EF8FB456, + 0xE73042C5D2AE266D, + 0x8527E324FDEB4B37, + 0x0A83C86FEE952ABC, + 0x7318523267779D74, + 0x3E66D3D56B8CACA1, + 0x956694A5C0095593, + 0xCAC54572BB1A6FC8, + 0xA7A4C9F3EDEBF0D8, + 0x7829851FAC17B143, + 0x2C8F4C9AF81BCF06, + 0xD34E31539740C732, + 0x3605A2AC253D2DB1, + 0x08C11B8346F4A3C3, + 0x6BE396289CE8A6DA, + 0xD9B957FB7FE794C5, + 0x05BE33DA04560A93, + 0x0957F1577BA9747C, + 0xDA2CC3ACC24FBA57, + 0x74136F185B29E7F0, + 0xB2F2B4590EDB93B2, + 0xB3608FCE8B86AE04, + 0x4A3A865079359063, + 0x5B3A7EF496880A50, + 0x48FAE3163854C23B, + 0x07AAA640476E0B9A, + 0x2F653656383A687D, + 0xA1031F8E7599D79C, + 0xA31908178FF92477, + 0x097EDF3C14C3FB83, + 0xB51CA83FEAA0971B, + 0xDD3C0D96D784F2E9, + 0x86CD26A9EA767D78, + 0xE6B215FF54A30C18, + 0xEC5B06A1C5531093, + 0x45665A929F9EC5E5, + 0x8C7609B4A9F10907, + 0x89AAC3A491F0D729, + 0x32CE6B26E0F4A403, + 0x614AB44E02B53E01, + 0xFA6472EB6EEF3290, + 0x9E5D75EB1948EB6A, + 0xB6D12AD4A8671852, + 0x88826F56EBA07AF1, + 0x44535BF2645BC0FD, + 0x169388FFC21E3728, + 0xF68AAC9E396D8224, + 0x8E87D7E7472B3883, + 0x295C26CAA8B423DE, + 0x322C814292E72176, + 0x8A06550EB8AF7268, + 0xEF86D60E661BCF71, + 0x9E5426C87F30EE54, + 0xF1EA8AA826FD047E, + 0x0BABAF9A642CB769, + 0x4B3341D4068D012E, + 0xD15605CBC30A335C, + 0x5B21060AED8412E5, + 0x45E2CDA1CE6F4227, + 0x50AE3745033AD7D4, + 0xAA4588CED46BF414, + 0xC1B0056C4A95467E, + 0x56576A71DE8B4089, + 0xBF20965FA6DC927E, + 0x569F8383C2040882, + 0xE1E772FBA08FECA0, + 0x4CED94AF97138AC4, + 0xC4112FFB337A82FB, + 0xD64A4FD41DE38B7D, + 0x4CFC32329EDEBCBB, + 0x0803564445050395, + 0xAA1574ECF4642FFD, + 0x694BC4E54CC315F9, + 0xA3D7CB273B011721, + 0x577C2F8B6115BFA5, + 0xB7EC8C1A769FB4C1, + 0x5D5CFCE63359AB19, + 0x33B96C3CD65B5F71, + 0xD845097780602BB9, + 0x84D47645D02DA3D5, + 0x83544F33B58773A5, + 0x9175CBB2160836C5, + 0xC71B3BC175E72BC5, + 0x636806AC222EC985, + 0xB6EF0E6950F52ED5, + 0xEAD3D8A0F3DFDAA5, + 0x922908FE9A861BA5, + 0x6D4821DE275FD5C5, + 0x1FE3FCE62BD816B5, + 0xC23E9FCCD6F70591, + 0xC1AF12BDFE16B5B5, + 0x39E9F18F2F85E221, + ] + + assert len(INPUTS) == len(OUTPUTS) + for i in range(len(INPUTS)): + assert _get_fnv1a_hash(INPUTS[i]) == OUTPUTS[i] + + +def _encode_and_hash(input: str, *, reverse: bool = False) -> int: + encoded_input = input.encode("UTF-8") + if reverse: + encoded_input = encoded_input[::-1] + return _get_fnv1a_hash(encoded_input) + + +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): + doc = en_tokenizer("spaCy✨ and Prodigy") + + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=case_sensitive, + p_lengths=bytes( + ( + 1, + 3, + 4, + ) + ), + s_lengths=bytes( + ( + 2, + 3, + 4, + 
5, + ) + ), + ) + assert hashes[0][0] == _encode_and_hash("s") + assert hashes[0][1] == _encode_and_hash("spa") + assert hashes[0][2] == _encode_and_hash("spaC" if case_sensitive else "spac") + assert hashes[0][3] == _encode_and_hash("yC" if case_sensitive else "yc") + assert hashes[0][4] == _encode_and_hash("yCa" if case_sensitive else "yca") + assert hashes[0][5] == _encode_and_hash("yCap" if case_sensitive else "ycap") + assert hashes[0][6] == _encode_and_hash("yCaps" if case_sensitive else "ycaps") + assert hashes[1][0] == _encode_and_hash("✨") + assert hashes[1][1] == _encode_and_hash("✨") + assert hashes[1][2] == _encode_and_hash("✨") + assert hashes[1][3] == _encode_and_hash("✨", reverse=True) + assert hashes[1][4] == _encode_and_hash("✨", reverse=True) + assert hashes[1][5] == _encode_and_hash("✨", reverse=True) + assert hashes[1][6] == _encode_and_hash("✨", reverse=True) + assert hashes[2][0] == _encode_and_hash("a") + assert hashes[2][1] == _encode_and_hash("and") + assert hashes[2][2] == _encode_and_hash("and") + assert hashes[2][3] == _encode_and_hash("dn") + assert hashes[2][4] == _encode_and_hash("dna") + assert hashes[2][5] == _encode_and_hash("dna") + assert hashes[2][6] == _encode_and_hash("dna") + assert hashes[3][0] == _encode_and_hash("P" if case_sensitive else "p") + assert hashes[3][1] == _encode_and_hash("Pro" if case_sensitive else "pro") + assert hashes[3][2] == _encode_and_hash("Prod" if case_sensitive else "prod") + assert hashes[3][3] == _encode_and_hash("yg") + assert hashes[3][4] == _encode_and_hash("ygi") + assert hashes[3][5] == _encode_and_hash("ygid") + assert hashes[3][6] == _encode_and_hash("ygido") + + +def test_get_character_combination_hashes_good_case_no_prefixes(en_tokenizer): + doc = en_tokenizer("spaCy✨ and Prodigy") + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=False, + p_lengths=bytes(), + s_lengths=bytes( + ( + 2, + 3, + 4, + 5, + ) + ), + ) + + assert hashes[0][0] == _encode_and_hash("yc") + assert hashes[0][1] == _encode_and_hash("yca") + assert hashes[0][2] == _encode_and_hash("ycap") + assert hashes[0][3] == _encode_and_hash("ycaps") + assert hashes[1][0] == _encode_and_hash("✨", reverse=True) + assert hashes[1][1] == _encode_and_hash("✨", reverse=True) + assert hashes[1][2] == _encode_and_hash("✨", reverse=True) + assert hashes[1][3] == _encode_and_hash("✨", reverse=True) + assert hashes[2][0] == _encode_and_hash("dn") + assert hashes[2][1] == _encode_and_hash("dna") + assert hashes[2][2] == _encode_and_hash("dna") + assert hashes[2][3] == _encode_and_hash("dna") + assert hashes[3][0] == _encode_and_hash("yg") + assert hashes[3][1] == _encode_and_hash("ygi") + assert hashes[3][2] == _encode_and_hash("ygid") + assert hashes[3][3] == _encode_and_hash("ygido") + + +def test_get_character_combination_hashes_loop_through_lengths(en_tokenizer): + doc = en_tokenizer("sp𐌞Cé") + + for p_length in range(1, 8): + for s_length in range(1, 8): + + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=False, + p_lengths=bytes((p_length,)), + s_lengths=bytes((s_length,)), + ) + + assert hashes[0][0] == _encode_and_hash("sp𐌞cé"[:p_length]) + assert hashes[0][1] == _encode_and_hash("sp𐌞cé"[-s_length:], reverse=True) + + +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_character_combination_hashes_turkish_i_with_dot( + en_tokenizer, case_sensitive +): + doc = en_tokenizer("İ".lower() + "İ") + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=case_sensitive, + 
p_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + s_lengths=bytes( + ( + 1, + 2, + 3, + 4, + ) + ), + ) + + COMBINING_DOT_ABOVE = b"\xcc\x87".decode("UTF-8") + assert hashes[0][0] == _encode_and_hash("i") + assert hashes[0][1] == _encode_and_hash("İ".lower()) + if case_sensitive: + assert hashes[0][2] == _encode_and_hash("İ".lower() + "İ") + assert hashes[0][3] == _encode_and_hash("İ".lower() + "İ") + assert hashes[0][4] == _encode_and_hash("İ", reverse=True) + assert hashes[0][5] == _encode_and_hash(COMBINING_DOT_ABOVE + "İ", reverse=True) + assert hashes[0][6] == _encode_and_hash("İ".lower() + "İ", reverse=True) + assert hashes[0][7] == _encode_and_hash("İ".lower() + "İ", reverse=True) + + else: + assert hashes[0][2] == _encode_and_hash("İ".lower() + "i") + assert hashes[0][3] == _encode_and_hash("İ".lower() * 2) + assert hashes[0][4] == _encode_and_hash(COMBINING_DOT_ABOVE, reverse=True) + assert hashes[0][5] == _encode_and_hash("İ".lower(), reverse=True) + assert hashes[0][6] == _encode_and_hash( + COMBINING_DOT_ABOVE + "İ".lower(), reverse=True + ) + assert hashes[0][7] == _encode_and_hash("İ".lower() * 2, reverse=True) + + +@pytest.mark.parametrize("case_sensitive", [True, False]) +def test_get_character_combination_hashes_string_store_spec_cases( + en_tokenizer, case_sensitive +): + symbol = "FLAG19" + short_word = "bee" + normal_word = "serendipity" + long_word = "serendipity" * 50 + assert len(long_word) > 255 + doc = en_tokenizer(" ".join((symbol, short_word, normal_word, long_word))) + assert len(doc) == 4 + hashes = get_character_combination_hashes( + doc=doc, + case_sensitive=case_sensitive, + p_lengths=bytes((2,)), + s_lengths=bytes((2,)), + ) + assert hashes[0][0] == _encode_and_hash("FL" if case_sensitive else "fl") + assert hashes[0][1] == _encode_and_hash("91") + assert hashes[1][0] == _encode_and_hash("be") + assert hashes[1][1] == _encode_and_hash("ee") + assert hashes[2][0] == hashes[3][0] == _encode_and_hash("se") + assert hashes[2][1] == hashes[3][1] == _encode_and_hash("yt") + + +def test_character_combination_hashes_empty_lengths(en_tokenizer): + doc = en_tokenizer("and𐌞") + assert get_character_combination_hashes( + doc=doc, + case_sensitive=True, + p_lengths=bytes(), + s_lengths=bytes(), + ).shape == (1, 0) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index e6a554e41..57d087958 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,7 +8,8 @@ from ..attrs cimport attr_id_t cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil -cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil +cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil + ctypedef const LexemeC* const_Lexeme_ptr ctypedef const TokenC* const_TokenC_ptr @@ -18,7 +19,6 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr - cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 @@ -34,31 +34,6 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 cdef int [:,:] _get_lca_matrix(Doc, int start, int end) -cdef void _set_prefix_lengths( - const unsigned char* tok_str, - const int tok_str_l, - const int p_max_l, - unsigned char* pref_l_buf, -) nogil - - -cdef void _set_suffix_lengths( - const unsigned char* tok_str, - const int tok_str_l, - const int s_max_l, - unsigned char* suff_l_buf, -) nogil - - -cdef int _write_hashes( - const unsigned char* res_buf, - const unsigned char* aff_l_buf, - const unsigned char* offset_buf, - const int res_buf_last, 
- np.uint64_t* hashes_ptr, -) nogil - - cdef class Doc: cdef readonly Pool mem cdef readonly Vocab vocab diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index fdf515c8f..f0cdaee87 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -126,7 +126,7 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ..., + default: str = ... ) -> None: ... @property def noun_chunks(self) -> Iterator[Span]: ... @@ -174,12 +174,5 @@ class Doc: self, doc_json: Dict[str, Any] = ..., validate: bool = False ) -> Doc: ... def to_utf8_array(self, nr_char: int = ...) -> Ints2d: ... - def get_character_combination_hashes( - self, - *, - case_sensitive: bool, - p_lengths: bytes, - s_lengths: bytes, - ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 93c6697e4..075bc4d15 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,9 +1,9 @@ # cython: infer_types=True, bounds_check=False, profile=True -from typing import Set, List +from typing import Set cimport cython cimport numpy as np -from libc.string cimport memcpy, memcmp, memset, strlen +from libc.string cimport memcpy from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t @@ -21,7 +21,6 @@ from .span cimport Span from .token cimport MISSING_DEP from ._dict_proxies import SpanGroups from .token cimport Token -from ..symbols import NAMES as SYMBOLS_BY_INT from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t @@ -41,7 +40,7 @@ from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS from ..util import get_words_and_spaces DEF PADDING = 5 -MAX_UTF8_CHAR_BYTE_WIDTH = 4 + cdef int bounds_check(int i, int length, int padding) except -1: if (i + padding) < 0: @@ -1746,77 +1745,6 @@ cdef class Doc: j += 1 return output - def get_character_combination_hashes(self, - *, - const bint case_sensitive, - const unsigned char* p_lengths, - const unsigned char* s_lengths, - ): - """ - Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations - derived from the raw text of each token. - - case_sensitive: if *False*, hashes are generated based on the lower-case version of each token. - p_lengths: an array of single-byte values specifying the lengths of prefixes to be hashed in ascending order. - For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". - s_lengths: an array of single-byte values specifying the lengths of suffixes to be hashed in ascending order. - For example, if *s_lengths==[2, 3]* and *cs == True*, the suffixes hashed for "spaCy" would be "yC" and "yCa". - - Many of the buffers passed into and used by this method contain single-byte numerical values. This takes advantage of - the fact that we are hashing short affixes and searching for small groups of characters. The calling code is responsible - for ensuring that lengths being passed in cannot exceed 63 and hence, with maximally four-byte - character widths, that individual values within buffers can never exceed the capacity of a single byte (255). - - Note that this method performs no data validation itself as it expects the calling code will already have done so, and - that the behaviour of the code may be erratic if the supplied parameters do not conform to expectations. 
- """ - - # Work out lengths - cdef int p_lengths_l = strlen( p_lengths) - cdef int s_lengths_l = strlen( s_lengths) - cdef int p_max_l = p_lengths[p_lengths_l - 1] if p_lengths_l > 0 else 0 - cdef int s_max_l = s_lengths[s_lengths_l - 1] if s_lengths_l > 0 else 0 - - # Define / allocate buffers - cdef Pool mem = Pool() - cdef unsigned char* pref_l_buf = mem.alloc(p_max_l, sizeof(char)) - cdef unsigned char* suff_l_buf = mem.alloc(s_max_l, sizeof(char)) - cdef int doc_l = self.length - cdef np.ndarray[np.uint64_t, ndim=2] hashes = numpy.empty( - (doc_l, p_lengths_l + s_lengths_l), dtype="uint64") - cdef np.uint64_t* hashes_ptr = hashes.data - - # Define working variables - cdef TokenC tok_c - cdef int tok_i, tok_str_l - cdef attr_t num_tok_attr - cdef bytes tok_str_bytes - cdef const unsigned char* tok_str - - for tok_i in range(doc_l): - tok_c = self.c[tok_i] - num_tok_attr = tok_c.lex.orth if case_sensitive else tok_c.lex.lower - if num_tok_attr < len(SYMBOLS_BY_INT): # hardly ever happens - if num_tok_attr == 0: - tok_str_bytes = b"" - else: - tok_str_bytes = SYMBOLS_BY_INT[num_tok_attr].encode("UTF-8") - tok_str = tok_str_bytes - tok_str_l = len(tok_str_bytes) - else: - tok_str, tok_str_l = self.vocab.strings.utf8_ptr(num_tok_attr) - - if p_max_l > 0: - _set_prefix_lengths(tok_str, tok_str_l, p_max_l, pref_l_buf) - hashes_ptr += _write_hashes(tok_str, p_lengths, pref_l_buf, 0, hashes_ptr) - - if s_max_l > 0: - _set_suffix_lengths(tok_str, tok_str_l, s_max_l, suff_l_buf) - hashes_ptr += _write_hashes(tok_str, s_lengths, suff_l_buf, tok_str_l - 1, hashes_ptr) - - return hashes - - @staticmethod def _get_array_attrs(): attrs = [LENGTH, SPACY] @@ -1998,113 +1926,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): return lca_matrix -cdef void _set_prefix_lengths( - const unsigned char* tok_str, - const int tok_str_l, - const int p_max_l, - unsigned char* pref_l_buf, -) nogil: - """ Populate *pref_l_buf*, which has length *p_max_l*, with the byte lengths of each of the substrings terminated by the first *p_max_l* - characters within *tok_str*. Lengths that are greater than the character length of the whole word are populated with the byte length - of the whole word. - - tok_str: a UTF-8 representation of a string. - tok_str_l: the length of *tok_str*. - p_max_l: the number of characters to process at the beginning of the word. - pref_l_buf: a buffer of length *p_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is - responsible for ensuring that *p_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values - within the buffer can never exceed the capacity of a single byte (255). - """ - cdef int tok_str_idx = 1, pref_l_buf_idx = 0 - - while pref_l_buf_idx < p_max_l: - if (tok_str_idx >= tok_str_l - or - ((tok_str[tok_str_idx] & 0xc0) != 0x80) # not a continuation character - ): - pref_l_buf[pref_l_buf_idx] = tok_str_idx - pref_l_buf_idx += 1 - if tok_str_idx >= tok_str_l: - break - tok_str_idx += 1 - - if pref_l_buf_idx < p_max_l: - memset(pref_l_buf + pref_l_buf_idx, pref_l_buf[pref_l_buf_idx - 1], p_max_l - pref_l_buf_idx) - - -cdef void _set_suffix_lengths( - const unsigned char* tok_str, - const int tok_str_l, - const int s_max_l, - unsigned char* suff_l_buf, -) nogil: - """ Populate *suff_l_buf*, which has length *s_max_l*, with the byte lengths of each of the substrings started by the last *s_max_l* - characters within *tok_str*. 
Lengths that are greater than the character length of the whole word are populated with the byte length - of the whole word. - - tok_str: a UTF-8 representation of a string. - tok_str_l: the length of *tok_str*. - s_max_l: the number of characters to process at the end of the word. - suff_l_buf: a buffer of length *s_max_l* in which to store the lengths. The code calling *get_character_combination_hashes()* is - responsible for ensuring that *s_max_l* cannot exceed 63 and hence, with maximally four-byte character widths, that individual values - within the buffer can never exceed the capacity of a single byte (255). - """ - cdef int tok_str_idx = tok_str_l - 1, suff_l_buf_idx = 0 - - while suff_l_buf_idx < s_max_l: - if (tok_str[tok_str_idx] & 0xc0) != 0x80: # not a continuation character - suff_l_buf[suff_l_buf_idx] = tok_str_l - tok_str_idx - suff_l_buf_idx += 1 - tok_str_idx -= 1 - if tok_str_idx < 0: - break - - if suff_l_buf_idx < s_max_l: - memset(suff_l_buf + suff_l_buf_idx, suff_l_buf[suff_l_buf_idx - 1], s_max_l - suff_l_buf_idx) - - -cdef uint64_t FNV1A_OFFSET_BASIS = 0xcbf29ce484222325 -cdef uint64_t FNV1A_PRIME = 0x00000100000001B3 - - -cdef int _write_hashes( - const unsigned char* res_buf, - const unsigned char* aff_l_buf, - const unsigned char* offset_buf, - const int res_buf_last, - np.uint64_t* hashes_ptr, -) nogil: - """ Write 64-bit FNV1A hashes for a token/rich property group combination. - - res_buf: the string from which to generate the hash values. - aff_l_buf: one-byte lengths describing how many characters to hash. - offset_buf: one-byte lengths specifying the byte offset of each character within *res_buf*. - res_buf_last: if affixes should start at the end of *res_buf*, the offset of the last byte in - *res_buf*; if affixes should start at the beginning of *res_buf*, *0*. - hashes_ptr: a pointer starting from which the new hashes should be written. - - Returns: the number of hashes written. - """ - - cdef int last_offset = 0, hash_idx = 0, offset, aff_l - cdef uint64_t hash_val = FNV1A_OFFSET_BASIS - - while True: - aff_l = aff_l_buf[hash_idx] - if aff_l == 0: - return hash_idx - offset = offset_buf[aff_l - 1] - while last_offset < offset: - if res_buf_last > 0: - hash_val ^= res_buf[res_buf_last - last_offset] - else: - hash_val ^= res_buf[last_offset] - hash_val *= FNV1A_PRIME - last_offset += 1 - hashes_ptr[hash_idx] = hash_val - hash_idx += 1 - - def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks,