spaCy/spacy/attrs.pyx


# Lookup table from attribute string names to the IDs that `intify_attr`
# returns for them.
# NOTE(review): the right-hand names (NULL_ATTR, IS_ALPHA, ...) are not defined
# in this view; presumably they are declared alongside this module -- confirm.
IDS = {
    # The empty string is a valid key and maps to NULL_ATTR.
    "": NULL_ATTR,
    # Named flag entries.
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,
    # Numbered flag entries, FLAG19 through FLAG63.
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,

    # String-form attributes of a word.
    # NOTE(review): grouping inferred from the names only -- confirm.
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,

    # Remaining attributes (length, annotations, entity and sentence info).
    # NOTE(review): grouping inferred from the names only -- confirm.
    "LENGTH": LENGTH,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "ENT_ID": ENT_ID,
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
    "SPACY": SPACY,
    "LANG": LANG,
    "MORPH": MORPH,
    "IDX": IDX
}


# Attribute names, ordered by ascending value of their ID in IDS. The sort is
# stable, so names with equal IDs keep their IDS insertion order.
NAMES = sorted(IDS, key=IDS.get)
# Merge every IDS entry into the mapping returned by locals().
# NOTE(review): presumably this exposes each attribute as a module-level
# name -- confirm.
locals().update(IDS)


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    Normalize a dictionary of attributes, converting them to ints.

    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
        can be ints or strings. When `_do_deprecated` is set, this dictionary
        is modified in place: deprecated keys are renamed or removed.
    strings_map (StringStore): Defaults to None. If provided, encodes string
        values into ints, using its `add` method when it has one and item
        lookup otherwise.
    _do_deprecated (bool): Defaults to False. If True, rename the legacy keys
        "F", "L" and "pos" to "ORTH", "LEMMA" and "TAG", and discard "morph",
        "number", "tenspect" and the morphology feature keys listed in the
        function body before converting.
    RETURNS (dict): Attributes dictionary with keys and optionally values
        converted to ints. Keys that `intify_attr` cannot convert are omitted.
    """
    inty_attrs = {}
    if _do_deprecated:
        # Legacy short names are moved to their current attribute names.
        for old_key, new_key in (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG")):
            if old_key in stringy_attrs:
                stringy_attrs[new_key] = stringy_attrs.pop(old_key)
        # These deprecated keys are dropped; their values are not used.
        for dropped_key in ("morph", "number", "tenspect"):
            stringy_attrs.pop(dropped_key, None)
        # Morphology feature names to filter out. Some names appear more than
        # once; the repeats are kept as-is because each pass over a name
        # removes at most one spelling of it (see the loop below).
        morph_keys = [
            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
            'NumValue', 'PartType', 'Polite', 'StyleVariant',
            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
            'Polarity', 'PrepCase', 'Animacy'  # U20
        ]
        for key in morph_keys:
            # Remove the first spelling found: as written, lower-cased, then
            # upper-cased. Only one spelling is removed per pass.
            for spelling in (key, key.lower(), key.upper()):
                if spelling in stringy_attrs:
                    stringy_attrs.pop(spelling)
                    break
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is None:
            # Unrecognised attribute names are silently skipped.
            continue
        if strings_map is not None and isinstance(value, str):
            if hasattr(strings_map, 'add'):
                value = strings_map.add(value)
            else:
                value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs


def intify_attr(name):
    """
    Normalize an attribute name, converting it to int.

    name (str): Attribute string name. Can also be int (will then be returned unchanged).
    RETURNS (int): int representation of the attribute, or None if neither the name
        nor its upper-cased form is a key of IDS.
    """
    if isinstance(name, int):
        return name
    # Try the name exactly as given first, then fall back to its upper-cased form.
    attr_id = IDS.get(name)
    if attr_id is None:
        attr_id = IDS.get(name.upper())
    return attr_id
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00
* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS 2015-10-10 09:55:55 +03:00			`IDS = {`
* Map empty string to NULL_ATTR in attrs 2015-10-10 14:10:19 +03:00			`"": NULL_ATTR,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"IS_ALPHA": IS_ALPHA,`
			`"IS_ASCII": IS_ASCII,`
			`"IS_DIGIT": IS_DIGIT,`
			`"IS_LOWER": IS_LOWER,`
			`"IS_PUNCT": IS_PUNCT,`
			`"IS_SPACE": IS_SPACE,`
			`"IS_TITLE": IS_TITLE,`
			`"IS_UPPER": IS_UPPER,`
			`"LIKE_URL": LIKE_URL,`
			`"LIKE_NUM": LIKE_NUM,`
			`"LIKE_EMAIL": LIKE_EMAIL,`
			`"IS_STOP": IS_STOP,`
Reduce stored lexemes data, move feats to lookups (#5238) * Reduce stored lexemes data, move feats to lookups * Move non-derivable lexemes features (`norm / cluster / prob`) to `spacy-lookups-data` as lookups * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in lookups only * Remove serialization of lexemes data as `vocab/lexemes.bin` * Remove `SerializedLexemeC` * Remove `Lexeme.to_bytes/from_bytes` * Modify normalization exception loading: * Always create `Vocab.lookups` table `lexeme_norm` for normalization exceptions * Load base exceptions from `lang.norm_exceptions`, but load language-specific exceptions from lookups * Set `lex_attr_getter[NORM]` including new lookups table in `BaseDefaults.create_vocab()` and when deserializing `Vocab` * Remove all cached lexemes when deserializing vocab to override existing normalizations with the new normalizations (as a replacement for the previous step that replaced all lexemes data with the deserialized data) * Skip English normalization test Skip English normalization test because the data is now in `spacy-lookups-data`. * Remove norm exceptions Moved to spacy-lookups-data. * Move norm exceptions test to spacy-lookups-data * Load extra lookups from spacy-lookups-data lazily Load extra lookups (currently for cluster and prob) lazily from the entry point `lg_extra` as `Vocab.lookups_extra`. * Skip creating lexeme cache on load To improve model loading times, do not create the full lexeme cache when loading. The lexemes will be created on demand when processing. * Identify numeric values in Lexeme.set_attrs() With the removal of a special case for `PROB`, also identify `float` to avoid trying to convert it with the `StringStore`. 
* Skip lexeme cache init in from_bytes * Unskip and update lookups tests for python3.6+ * Update vocab pickle to include lookups_extra * Update vocab serialization tests Check strings rather than lexemes since lexemes aren't initialized automatically, account for addition of "_SP". * Re-skip lookups test because of python3.5 * Skip PROB/float values in Lexeme.set_attrs * Convert is_oov from lexeme flag to lex in vectors Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether the lexeme has a vector. Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com> 2020-05-19 16:59:14 +03:00			`"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,`
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`"IS_BRACKET": IS_BRACKET,`
			`"IS_QUOTE": IS_QUOTE,`
			`"IS_LEFT_PUNCT": IS_LEFT_PUNCT,`
			`"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,`
code for is_currency 2018-02-11 20:51:32 +03:00			`"IS_CURRENCY": IS_CURRENCY,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"FLAG19": FLAG19,`
			`"FLAG20": FLAG20,`
			`"FLAG21": FLAG21,`
			`"FLAG22": FLAG22,`
			`"FLAG23": FLAG23,`
			`"FLAG24": FLAG24,`
			`"FLAG25": FLAG25,`
			`"FLAG26": FLAG26,`
			`"FLAG27": FLAG27,`
			`"FLAG28": FLAG28,`
			`"FLAG29": FLAG29,`
			`"FLAG30": FLAG30,`
			`"FLAG31": FLAG31,`
			`"FLAG32": FLAG32,`
			`"FLAG33": FLAG33,`
			`"FLAG34": FLAG34,`
			`"FLAG35": FLAG35,`
			`"FLAG36": FLAG36,`
			`"FLAG37": FLAG37,`
			`"FLAG38": FLAG38,`
			`"FLAG39": FLAG39,`
			`"FLAG40": FLAG40,`
			`"FLAG41": FLAG41,`
			`"FLAG42": FLAG42,`
			`"FLAG43": FLAG43,`
			`"FLAG44": FLAG44,`
			`"FLAG45": FLAG45,`
			`"FLAG46": FLAG46,`
			`"FLAG47": FLAG47,`
			`"FLAG48": FLAG48,`
			`"FLAG49": FLAG49,`
			`"FLAG50": FLAG50,`
			`"FLAG51": FLAG51,`
			`"FLAG52": FLAG52,`
			`"FLAG53": FLAG53,`
			`"FLAG54": FLAG54,`
			`"FLAG55": FLAG55,`
			`"FLAG56": FLAG56,`
			`"FLAG57": FLAG57,`
			`"FLAG58": FLAG58,`
			`"FLAG59": FLAG59,`
			`"FLAG60": FLAG60,`
			`"FLAG61": FLAG61,`
			`"FLAG62": FLAG62,`
			`"FLAG63": FLAG63,`

			`"ID": ID,`
			`"ORTH": ORTH,`
			`"LOWER": LOWER,`
			`"NORM": NORM,`
			`"SHAPE": SHAPE,`
			`"PREFIX": PREFIX,`
			`"SUFFIX": SUFFIX,`

			`"LENGTH": LENGTH,`
			`"LEMMA": LEMMA,`
			`"POS": POS,`
			`"TAG": TAG,`
			`"DEP": DEP,`
			`"ENT_IOB": ENT_IOB,`
			`"ENT_TYPE": ENT_TYPE,`
serialize ENT_ID (#4852) * expand serialization test for custom token attribute * add failing test for issue 4849 * define ENT_ID as attr and use in doc serialization * fix few typos 2020-01-06 16:57:34 +03:00			`"ENT_ID": ENT_ID,`
ensure Span.as_doc keeps the entity links + unit test 2019-06-25 16:28:51 +03:00			`"ENT_KB_ID": ENT_KB_ID,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"HEAD": HEAD,`
Add SENT_START attribute, for custom sentence boundary detection 2016-05-05 13:11:57 +03:00			`"SENT_START": SENT_START,`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`"SPACY": SPACY,`
introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately 2016-03-10 15:01:34 +03:00			`"LANG": LANG,`
Add MORPH attr, add support in retokenizer (#4947) * Add MORPH attr / symbol for token attrs * Update retokenizer for MORPH 2020-01-29 19:45:46 +03:00			`"MORPH": MORPH,`
make idx available via to_array (#5030) 2020-02-22 16:13:06 +03:00			`"IDX": IDX`
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`}`

Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00
* Refactor symbols, so that frequency rank can be derived from the orth id of a word. 2015-10-06 16:39:50 +03:00			`# ATTR IDs, in order of the symbol`
* Rename ATTR_IDS to attrs.IDS. Rename ATTR_NAMES to attrs.NAMES. Rename UNIV_POS_IDS to parts_of_speech.IDS 2015-10-10 09:55:55 +03:00			`NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]`
Fix cpdef enum in attrs.pyx 2017-09-17 20:28:53 +03:00			`locals().update(IDS)`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00

			`def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Normalize a dictionary of attributes, converting them to ints.`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Tidy up rest 2017-10-27 22:07:59 +03:00			`stringy_attrs (dict): Dictionary keyed by attribute string names. Values`
			`can be ints or strings.`
			`strings_map (StringStore): Defaults to None. If provided, encodes string`
			`values into ints.`
			`RETURNS (dict): Attributes dictionary with keys and optionally values`
			`converted to ints.`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`inty_attrs = {}`
			`if _do_deprecated:`
			`if 'F' in stringy_attrs:`
			`stringy_attrs["ORTH"] = stringy_attrs.pop("F")`
			`if 'L' in stringy_attrs:`
			`stringy_attrs["LEMMA"] = stringy_attrs.pop("L")`
			`if 'pos' in stringy_attrs:`
			`stringy_attrs["TAG"] = stringy_attrs.pop("pos")`
Exclude morphs from deprecated token attributes for now 2016-11-25 18:17:32 +03:00			`if 'morph' in stringy_attrs:`
			`morphs = stringy_attrs.pop('morph')`
Filter out deprecated attributes when reading special-case tokenization rules. 2016-11-25 18:57:18 +03:00			`if 'number' in stringy_attrs:`
			`stringy_attrs.pop('number')`
			`if 'tenspect' in stringy_attrs:`
			`stringy_attrs.pop('tenspect')`
Filter out morphology keys in deprecated attrs 2016-12-18 18:50:26 +03:00			`morph_keys = [`
			`'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',`
			`'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',`
			`'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',`
Add more morphology names in attrs.pyx 2017-03-15 17:26:16 +03:00			`'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',`
			`'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',`
			`'NumValue', 'PartType', 'Polite', 'StyleVariant',`
			`'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',`
Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop" This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df. 2018-03-27 20:23:02 +03:00			`'Polarity', 'PrepCase', 'Animacy' # U20`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`]`
Filter out morphology keys in deprecated attrs 2016-12-18 18:50:26 +03:00			`for key in morph_keys:`
			`if key in stringy_attrs:`
			`stringy_attrs.pop(key)`
Ignore more morphology attributes in deprecated mode of intify_attrs 2016-12-18 19:33:46 +03:00			`elif key.lower() in stringy_attrs:`
			`stringy_attrs.pop(key.lower())`
			`elif key.upper() in stringy_attrs:`
			`stringy_attrs.pop(key.upper())`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`for name, value in stringy_attrs.items():`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 15:48:45 +03:00			`int_key = intify_attr(name)`
			`if int_key is not None:`
Update Cython string types (#9143) * Replace all basestring references with unicode `basestring` was a compatability type introduced by Cython to make dealing with utf-8 strings in Python2 easier. In Python3 it is equivalent to the unicode (or str) type. I replaced all references to basestring with unicode, since that was used elsewhere, but we could also just replace them with str, which shoudl also be equivalent. All tests pass locally. * Replace all references to unicode type with str Since we only support python3 this is simpler. * Remove all references to unicode type This removes all references to the unicode type across the codebase and replaces them with `str`, which makes it more drastic than the prior commits. In order to make this work importing `unicode_literals` had to be removed, and one explicit unicode literal also had to be removed (it is unclear why this is necessary in Cython with language level 3, but without doing it there were errors about implicit conversion). When `unicode` is used as a type in comments it was also edited to be `str`. Additionally `coding: utf8` headers were removed from a few files. 2021-09-13 18:02:17 +03:00			`if strings_map is not None and isinstance(value, str):`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 15:48:45 +03:00			`if hasattr(strings_map, 'add'):`
			`value = strings_map.add(value)`
			`else:`
			`value = strings_map[value]`
			`inty_attrs[int_key] = value`
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries. 2016-11-25 13:34:30 +03:00			`return inty_attrs`
Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) 2019-10-03 15:48:45 +03:00

			`def intify_attr(name):`
			`"""`
			`Normalize an attribute name, converting it to int.`

			`stringy_attr (string): Attribute string name. Can also be int (will then be left unchanged)`
			`RETURNS (int): int representation of the attribute, or None if it couldn't be converted.`
			`"""`
			`if isinstance(name, int):`
			`return name`
			`elif name in IDS:`
			`return IDS[name]`
			`elif name.upper() in IDS:`
			`return IDS[name.upper()]`
			`return None`