spaCy/spacy/attrs.pyx


# Lookup table from attribute name (string) to the attribute's ID symbol.
# The values on the right-hand side are names defined outside this view
# (NOTE(review): presumably the enum members declared in the matching .pxd
# file -- confirm). `intify_attr` below uses this table to resolve names.
IDS = {
    # The empty string resolves to the null attribute.
    "": NULL_ATTR,
    # Named lexical flags.
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,
    # Numbered flag slots FLAG19 .. FLAG63.
    # NOTE(review): presumably reserved for user-defined flags -- confirm.
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,

    # String-form attributes.
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,

    # Remaining lexical and token-level attributes.
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "ENT_ID": ENT_ID,
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
    "SENT_END": SENT_END,
    "SPACY": SPACY,
    "PROB": PROB,
    "LANG": LANG,
    "MORPH": MORPH,
    "IDX": IDX
}


# Attribute names (the keys of IDS), ordered by their ID (the values of IDS).
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
# Push every name -> ID pair of IDS into the mapping returned by locals().
# NOTE(review): presumably done so the attribute symbols are also reachable
# as Python-level attributes of this module -- confirm against the .pxd and
# the Cython version in use.
locals().update(IDS)


# Morphology feature names that are silently dropped in deprecated mode.
# Each entry is removed in its literal, lower-case and upper-case spelling.
_DEPRECATED_MORPH_KEYS = (
    'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
    'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
    'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
    'Gender', 'Mood', 'Negative', 'Voice', 'Abbr',
    'Derivation', 'Echo', 'NameType', 'NumForm',
    'NumValue', 'Polite', 'StyleVariant',
    'AdjType', 'Person', 'Variant', 'AdpType',
    'Reflex', 'Case',
    'Polarity', 'PrepCase', 'Animacy',  # U20
)


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    Normalize a dictionary of attributes, converting them to ints.

    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
        can be ints or strings. When `_do_deprecated` is true this dictionary
        is modified in place (legacy keys are renamed or removed).
    strings_map (StringStore): Defaults to None. If provided, encodes string
        values into ints, via `strings_map.add(value)` when an `add` method
        exists and via `strings_map[value]` otherwise.
    _do_deprecated (bool): Defaults to False. If true, rename the legacy keys
        'F', 'L' and 'pos' to "ORTH", "LEMMA" and "TAG", and drop 'morph',
        'number', 'tenspect' and all known morphology feature keys.
    RETURNS (dict): Attributes dictionary with keys and optionally values
        converted to ints. Keys that `intify_attr` cannot resolve are omitted.
    """
    inty_attrs = {}
    if _do_deprecated:
        # Legacy short names map onto their current attribute names. The
        # order is kept so that key insertion order matches earlier behaviour.
        for old_name, new_name in (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG")):
            if old_name in stringy_attrs:
                stringy_attrs[new_name] = stringy_attrs.pop(old_name)
        # These legacy keys carry no attribute and are simply discarded.
        for dropped in ('morph', 'number', 'tenspect'):
            stringy_attrs.pop(dropped, None)
        # Remove every case variant of each morphology key, so the result
        # does not depend on which spelling the caller happened to use.
        for key in _DEPRECATED_MORPH_KEYS:
            for variant in (key, key.lower(), key.upper()):
                stringy_attrs.pop(variant, None)
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is None:
            # Unknown attribute names are skipped rather than reported.
            continue
        # NOTE(review): `basestring` is kept unchanged from the original;
        # presumably it is resolved by Cython / Python 2 -- confirm before
        # replacing it with `str`.
        if strings_map is not None and isinstance(value, basestring):
            if hasattr(strings_map, 'add'):
                value = strings_map.add(value)
            else:
                value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs


def intify_attr(name):
    """
    Resolve a single attribute name to its integer ID.

    name (string or int): Attribute name to look up in `IDS`. An int is
        treated as an already-resolved ID and returned unchanged.
    RETURNS (int): The ID stored in `IDS` for `name`, or for `name.upper()`
        if the exact spelling is not a key. None if neither spelling is known.
    """
    # Integers are already IDs; nothing to look up.
    if isinstance(name, int):
        return name
    # Exact spelling takes precedence over the upper-cased fallback.
    try:
        return IDS[name]
    except KeyError:
        pass
    # `dict.get` yields None when the upper-cased name is unknown as well.
    return IDS.get(name.upper())
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00
* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS 2015-10-10 09:55:55 +03:00			`IDS = {`
* Map empty string to NULL_ATTR in attrs 2015-10-10 14:10:19 +03:00			`"": NULL_ATTR,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"IS_ALPHA": IS_ALPHA,`
			`"IS_ASCII": IS_ASCII,`
			`"IS_DIGIT": IS_DIGIT,`
			`"IS_LOWER": IS_LOWER,`
			`"IS_PUNCT": IS_PUNCT,`
			`"IS_SPACE": IS_SPACE,`
			`"IS_TITLE": IS_TITLE,`
			`"IS_UPPER": IS_UPPER,`
			`"LIKE_URL": LIKE_URL,`
			`"LIKE_NUM": LIKE_NUM,`
			`"LIKE_EMAIL": LIKE_EMAIL,`
			`"IS_STOP": IS_STOP,`
Reduce stored lexemes data, move feats to lookups (#5238) * Reduce stored lexemes data, move feats to lookups * Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in lookups only * Remove serialization of lexemes data as `vocab/lexemes.bin` * Remove `SerializedLexemeC` * Remove `Lexeme.to_bytes/from_bytes` * Modify normalization exception loading: * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab` * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data) * Skip English normalization test Skip English normalization test because the data is now in `spacy-lookups-data`. * Remove norm exceptions Moved to spacy-lookups-data. * Move norm exceptions test to spacy-lookups-data * Load extra lookups from spacy-lookups-data lazily Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`. * Skip creating lexeme cache on load To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing. * Identify numeric values in Lexeme.set_attrs() With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`. 
* Skip lexeme cache init in from_bytes * Unskip and update lookups tests for python3.6+ * Update vocab pickle to include lookups_extra * Update vocab serialization tests Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP". * Re-skip lookups test because of python3.5 * Skip PROB/float values in Lexeme.set_attrs * Convert is_oov from lexeme flag to lex in vectors Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector. Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2020-05-19 16:59:14 +03:00			`"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,`
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`"IS_BRACKET": IS_BRACKET,`
			`"IS_QUOTE": IS_QUOTE,`
			`"IS_LEFT_PUNCT": IS_LEFT_PUNCT,`
			`"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,`
code for is_currency 2018-02-11 20:51:32 +03:00			`"IS_CURRENCY": IS_CURRENCY,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"FLAG19": FLAG19,`
			`"FLAG20": FLAG20,`
			`"FLAG21": FLAG21,`
			`"FLAG22": FLAG22,`
			`"FLAG23": FLAG23,`
			`"FLAG24": FLAG24,`
			`"FLAG25": FLAG25,`
			`"FLAG26": FLAG26,`
			`"FLAG27": FLAG27,`
			`"FLAG28": FLAG28,`
			`"FLAG29": FLAG29,`
			`"FLAG30": FLAG30,`
			`"FLAG31": FLAG31,`
			`"FLAG32": FLAG32,`
			`"FLAG33": FLAG33,`
			`"FLAG34": FLAG34,`
			`"FLAG35": FLAG35,`
			`"FLAG36": FLAG36,`
			`"FLAG37": FLAG37,`
			`"FLAG38": FLAG38,`
			`"FLAG39": FLAG39,`
			`"FLAG40": FLAG40,`
			`"FLAG41": FLAG41,`
			`"FLAG42": FLAG42,`
			`"FLAG43": FLAG43,`
			`"FLAG44": FLAG44,`
			`"FLAG45": FLAG45,`
			`"FLAG46": FLAG46,`
			`"FLAG47": FLAG47,`
			`"FLAG48": FLAG48,`
			`"FLAG49": FLAG49,`
			`"FLAG50": FLAG50,`
			`"FLAG51": FLAG51,`
			`"FLAG52": FLAG52,`
			`"FLAG53": FLAG53,`
			`"FLAG54": FLAG54,`
			`"FLAG55": FLAG55,`
			`"FLAG56": FLAG56,`
			`"FLAG57": FLAG57,`
			`"FLAG58": FLAG58,`
			`"FLAG59": FLAG59,`
			`"FLAG60": FLAG60,`
			`"FLAG61": FLAG61,`
			`"FLAG62": FLAG62,`
			`"FLAG63": FLAG63,`

			`"ID": ID,`
			`"ORTH": ORTH,`
			`"LOWER": LOWER,`
			`"NORM": NORM,`
			`"SHAPE": SHAPE,`
			`"PREFIX": PREFIX,`
			`"SUFFIX": SUFFIX,`

			`"LENGTH": LENGTH,`
			`"CLUSTER": CLUSTER,`
			`"LEMMA": LEMMA,`
			`"POS": POS,`
			`"TAG": TAG,`
			`"DEP": DEP,`
			`"ENT_IOB": ENT_IOB,`
			`"ENT_TYPE": ENT_TYPE,`
serialize ENT_ID (#4852) * expand serialization test for custom token attribute * add failing test for issue 4849 * define ENT_ID as attr and use in doc serialization * fix few typos 2020-01-06 16:57:34 +03:00			`"ENT_ID": ENT_ID,`
ensure Span.as_doc keeps the entity links + unit test 2019-06-25 16:28:51 +03:00			`"ENT_KB_ID": ENT_KB_ID,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"HEAD": HEAD,`
Add SENT_START attribute, for custom sentence boundary detection 2016-05-05 13:11:57 +03:00			`"SENT_START": SENT_START,`
Add is_sent_end token property (#5375) Reconstruction of the original PR #4697 by @MiniLau. Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema because the Matcher is only going to be able to support `IS_SENT_START`. 2020-04-29 13:53:16 +03:00			`"SENT_END": SENT_END,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"SPACY": SPACY,`
			`"PROB": PROB,`
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`"LANG": LANG,`
Add MORPH attr, add support in retokenizer (#4947) * Add MORPH attr / symbol for token attrs * Update retokenizer for MORPH 2020-01-29 19:45:46 +03:00			`"MORPH": MORPH,`
make idx available via to_array (#5030) 2020-02-22 16:13:06 +03:00			`"IDX": IDX`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`}`

Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`# ATTR IDs, in order of the symbol`
* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS 2015-10-10 09:55:55 +03:00			`NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]`
Fix cpdef enum in attrs.pyx 2017-09-17 20:28:53 +03:00			`locals().update(IDS)`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00

			`def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Normalize a dictionary of attributes, converting them to ints.`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Tidy up rest 2017-10-27 22:07:59 +03:00			`stringy_attrs (dict): Dictionary keyed by attribute string names. Values`
			`can be ints or strings.`
			`strings_map (StringStore): Defaults to None. If provided, encodes string`
			`values into ints.`
			`RETURNS (dict): Attributes dictionary with keys and optionally values`
			`converted to ints.`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`inty_attrs = {}`
			`if _do_deprecated:`
			`if 'F' in stringy_attrs:`
			`stringy_attrs["ORTH"] = stringy_attrs.pop("F")`
			`if 'L' in stringy_attrs:`
			`stringy_attrs["LEMMA"] = stringy_attrs.pop("L")`
			`if 'pos' in stringy_attrs:`
			`stringy_attrs["TAG"] = stringy_attrs.pop("pos")`
Exclude morphs from deprecated token attributes for now 2016-11-25 18:17:32 +03:00			`if 'morph' in stringy_attrs:`
			`morphs = stringy_attrs.pop('morph')`
Filter out deprecated attributes when reading special-case tokenization rules. 2016-11-25 18:57:18 +03:00			`if 'number' in stringy_attrs:`
			`stringy_attrs.pop('number')`
			`if 'tenspect' in stringy_attrs:`
			`stringy_attrs.pop('tenspect')`
Filter out morphology keys in deprecated attrs 2016-12-18 18:50:26 +03:00			`morph_keys = [`
			`'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',`
			`'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',`
			`'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',`
Add more morphology names in attrs.pyx 2017-03-15 17:26:16 +03:00			`'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',`
			`'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',`
			`'NumValue', 'PartType', 'Polite', 'StyleVariant',`
			`'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`'Polarity', 'PrepCase', 'Animacy' # U20`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`]`
Filter out morphology keys in deprecated attrs 2016-12-18 18:50:26 +03:00			`for key in morph_keys:`
			`if key in stringy_attrs:`
			`stringy_attrs.pop(key)`
Ignore more morphology attributes in deprecated mode of intify_attrs 2016-12-18 19:33:46 +03:00			`elif key.lower() in stringy_attrs:`
			`stringy_attrs.pop(key.lower())`
			`elif key.upper() in stringy_attrs:`
			`stringy_attrs.pop(key.upper())`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`for name, value in stringy_attrs.items():`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 15:48:45 +03:00			`int_key = intify_attr(name)`
			`if int_key is not None:`
			`if strings_map is not None and isinstance(value, basestring):`
			`if hasattr(strings_map, 'add'):`
			`value = strings_map.add(value)`
			`else:`
			`value = strings_map[value]`
			`inty_attrs[int_key] = value`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`return inty_attrs`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 15:48:45 +03:00

			`def intify_attr(name):`
			`"""`
			`Normalize an attribute name, converting it to int.`

			`stringy_attr (string): Attribute string name. Can also be int (will then be left unchanged)`
			`RETURNS (int): int representation of the attribute, or None if it couldn't be converted.`
			`"""`
			`if isinstance(name, int):`
			`return name`
			`elif name in IDS:`
			`return IDS[name]`
			`elif name.upper() in IDS:`
			`return IDS[name.upper()]`
			`return None`