Intermediate state

richardpaulhudson 2022-09-16 20:00:20 +02:00
parent d575b9f8d4
commit 6f42d79c1e
4 changed files with 210 additions and 2 deletions

spacy/errors.py

@@ -941,6 +941,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "`{arg2}`={arg2_values} but these arguments are conflicting.")
    E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
             "{value}.")
+   E1044 = ("Special characters definition for '{label}' may not contain upper-case characters when case_sensitive==False.")
+   E1045 = ("Invalid affix group config '{label}'.")

    # Deprecated model shortcuts, only used in errors and warnings

spacy/ml/affixextractor.py (new file)

@@ -0,0 +1,61 @@
from typing import List, Optional, Callable, Tuple
from thinc.types import Ints2d
from thinc.api import Model, registry, get_current_ops
from ..tokens import Doc


@registry.layers("spacy.AffixExtractor.v1")
def AffixExtractor(
*,
suffs_not_prefs: bool,
case_sensitive: bool,
len_start: Optional[int],
len_end: Optional[int],
special_chars: Optional[str],
sc_len_start: Optional[int],
sc_len_end: Optional[int],
) -> Model[List[Doc], List[Ints2d]]:
return Model(
"extract_affixes",
forward,
attrs={
"suffs_not_prefs": suffs_not_prefs,
"case_sensitive": case_sensitive,
"len_start": len_start if len_start is not None else 0,
"len_end": len_end if len_end is not None else 0,
"special_chars": special_chars if special_chars is not None else "",
"sc_len_start": sc_len_start if sc_len_start is not None else 0,
"sc_len_end": sc_len_end if sc_len_end is not None else 0,
},
    )


def forward(
model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
suffs_not_prefs: bool = model.attrs["suffs_not_prefs"]
case_sensitive: bool = model.attrs["case_sensitive"]
len_start: int = model.attrs["len_start"]
len_end: int = model.attrs["len_end"]
special_chars: str = model.attrs["special_chars"]
sc_len_start: int = model.attrs["sc_len_start"]
    sc_len_end: int = model.attrs["sc_len_end"]

    features: List[Ints2d] = []
for doc in docs:
features.append(
model.ops.asarray2i(
doc.get_affix_hashes(
suffs_not_prefs,
case_sensitive,
len_start,
len_end,
special_chars,
sc_len_start,
sc_len_end,
)
)
        )

    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop
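
A minimal usage sketch for the new layer (hedged: it assumes this branch, since Doc.get_affix_hashes is only added in this commit, and the length-range semantics are inferred from the forward pass above):

import spacy
from spacy.ml.affixextractor import AffixExtractor

nlp = spacy.blank("en")
# Suffix hashes for lengths 2-4 (case-insensitive, so tokens are lower-cased
# first), plus hashes over the special characters "aeiou" for lengths 1-2.
extractor = AffixExtractor(
    suffs_not_prefs=True,
    case_sensitive=False,
    len_start=2,
    len_end=5,
    special_chars="aeiou",
    sc_len_start=1,
    sc_len_end=3,
)
features, backprop = extractor([nlp("interesting affixes")], is_train=False)
# Expected: one Ints2d per Doc of shape (num_tokens, (5 - 2) + (3 - 1)).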

spacy/ml/models/tok2vec.py

@@ -1,4 +1,5 @@
from typing import Optional, List, Union, cast
+from ..affixextractor import AffixExtractor
from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed

@@ -185,6 +186,150 @@ def MultiHashEmbed(
    )
    return model

def process_affix_config_group(
label: str,
start_len: Optional[int],
end_len: Optional[int],
rows: Optional[List[int]],
scs: Optional[str],
is_sc: bool,
) -> List[int]:
    # A group must be either fully unset (contributing no rows) or fully
    # specified: a length range [start_len, end_len) with one row count per
    # affix length, plus a special-character string when is_sc is True.
if start_len is not None or end_len is not None or rows is not None:
if start_len is None or end_len is None or rows is None:
raise ValueError(Errors.E1045.format(label=label))
if start_len < 0 or end_len < start_len + 1:
raise ValueError(Errors.E1045.format(label=label))
if is_sc and scs is None:
raise ValueError(Errors.E1045.format(label=label))
if scs is not None and scs != scs.lower():
raise ValueError(Errors.E1044.format(label=label))
if len(rows) != end_len - start_len:
raise ValueError(Errors.E1045.format(label=label))
elif scs is not None:
raise ValueError(Errors.E1045.format(label=label))
return rows if rows is not None else []
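
To illustrate the validation rules (a sketch; the row counts are hypothetical values):

# Fully specified suffix group: three row counts for affix lengths 2, 3 and 4.
assert process_affix_config_group("suffix", 2, 5, [2000, 2000, 2000], None, False) == [2000, 2000, 2000]
# Fully unset group: contributes no embedding rows.
assert process_affix_config_group("suffix", None, None, None, None, False) == []
# Two row counts for three lengths would raise E1045; upper-case characters
# in the special-character string would raise E1044.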
@registry.architectures("spacy.AffixMultiHashEmbed.v1")
def AffixMultiHashEmbed(
width: int,
attrs: List[Union[str, int]],
rows: List[int],
include_static_vectors: bool,
*,
affix_case_sensitive: bool,
suffix_start_len: Optional[int] = None,
suffix_end_len: Optional[int] = None,
suffix_rows: Optional[List[int]] = None,
suffix_scs: Optional[str] = None,
suffix_sc_start_len: Optional[int] = None,
suffix_sc_end_len: Optional[int] = None,
suffix_sc_rows: Optional[List[int]] = None,
prefix_start_len: Optional[int] = None,
prefix_end_len: Optional[int] = None,
prefix_rows: Optional[List[int]] = None,
prefix_scs: Optional[str] = None,
prefix_sc_start_len: Optional[int] = None,
prefix_sc_end_len: Optional[int] = None,
prefix_sc_rows: Optional[List[int]] = None,
) -> Model[List[Doc], List[Floats2d]]:
"""
TODO
"""
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
rows.extend(
process_affix_config_group(
"prefix", prefix_start_len, prefix_end_len, prefix_rows, None, False
)
)
rows.extend(
process_affix_config_group(
"prefix_sc",
prefix_sc_start_len,
prefix_sc_end_len,
prefix_sc_rows,
prefix_scs,
True,
)
)
rows.extend(
process_affix_config_group(
"suffix", suffix_start_len, suffix_end_len, suffix_rows, None, False
)
)
rows.extend(
process_affix_config_group(
"suffix_sc",
suffix_sc_start_len,
suffix_sc_end_len,
suffix_sc_rows,
suffix_scs,
True,
)
)
embeddings = [
HashEmbed(width, row, column=i, seed=i + 7, dropout=0.0)
for i, row in enumerate(rows)
]
concat_size = width * (len(embeddings) + include_static_vectors)
max_out: Model[Ragged, Ragged] = with_array(
Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)
)
extractors = [FeatureExtractor(attrs)]
if prefix_start_len is not None or prefix_sc_start_len is not None:
extractors.append(
AffixExtractor(
suffs_not_prefs=False,
case_sensitive=affix_case_sensitive,
len_start=prefix_start_len,
len_end=prefix_end_len,
special_chars=prefix_scs,
sc_len_start=prefix_sc_start_len,
sc_len_end=prefix_sc_end_len,
)
)
if suffix_start_len is not None or suffix_sc_start_len is not None:
extractors.append(
AffixExtractor(
suffs_not_prefs=True,
case_sensitive=affix_case_sensitive,
len_start=suffix_start_len,
len_end=suffix_end_len,
special_chars=suffix_scs,
sc_len_start=suffix_sc_start_len,
sc_len_end=suffix_sc_end_len,
)
)
if include_static_vectors:
feature_extractor: Model[List[Doc], Ragged] = chain(
concatenate(*extractors),
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
)
model = chain(
concatenate(
feature_extractor,
StaticVectors(width, dropout=0.0),
),
max_out,
ragged2list(),
)
else:
model = chain(
concatenate(*extractors),
cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)),
max_out,
ragged2list(),
)
return model
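
A hedged sketch of calling the new architecture directly (the width, attrs, and row counts are illustrative values, not defaults from this commit):

embed = AffixMultiHashEmbed(
    width=96,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 1000, 2500, 2500],
    include_static_vectors=False,
    affix_case_sensitive=False,
    # Suffix lengths 2-4: one embedding table per affix length.
    suffix_start_len=2,
    suffix_end_len=5,
    suffix_rows=[2000, 2000, 2000],
)
# Four attribute tables plus three suffix tables are concatenated and mixed
# by the Maxout layer; the suffix hashes come from the AffixExtractor above.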
@registry.architectures("spacy.CharacterEmbed.v2")
def CharacterEmbed(

spacy/tokens/doc.pyx

@@ -1735,7 +1735,7 @@ cdef class Doc:
            j += 1
        return output

-    def get_affix_hashes(self, bint suffs_not_prefs, bint lower_not_orth, unsigned int len_start, unsigned int len_end,
+    def get_affix_hashes(self, bint suffs_not_prefs, bint case_sensitive, unsigned int len_start, unsigned int len_end,
            str special_chars, unsigned int sc_len_start, unsigned int sc_len_end):
        """
        TODO

@@ -1746,7 +1746,7 @@ cdef class Doc:
        cdef np.ndarray[np.int64_t, ndim=2] output = numpy.empty((num_tokens, num_norm_hashes + num_spec_hashes), dtype="int64")
        for token_index in range(num_tokens):
-            token_string = self[token_index].orth_ if lower_not_orth else self[token_index].lower_
+            token_string = self[token_index].orth_ if case_sensitive else self[token_index].lower_
            if suffs_not_prefs:
                token_string = token_string[::-1]
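
For reference, a hedged example of how the Cython method is called on this branch (the argument semantics are inferred from the signature and from the AffixExtractor forward pass; the docstring itself is still TODO):

import spacy

nlp = spacy.blank("en")
doc = nlp("Affixes galore")
# Suffix hashes (case-insensitive) for lengths 2-4, plus hashes over the
# special characters "fx" for lengths 1-2.
hashes = doc.get_affix_hashes(True, False, 2, 5, "fx", 1, 3)
# Expected: int64 array of shape (2 tokens, (5 - 2) + (3 - 1)) == (2, 5).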