spaCy/spacy/tests/test_symbols.py
Adriane Boyd 4a615cacd2
Consolidate and freeze symbols (#11352)
* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
2022-09-02 09:08:40 +02:00

468 lines
10 KiB
Python

import pytest
from spacy.symbols import IDS, NAMES
V3_SYMBOLS = {
"": 0,
"IS_ALPHA": 1,
"IS_ASCII": 2,
"IS_DIGIT": 3,
"IS_LOWER": 4,
"IS_PUNCT": 5,
"IS_SPACE": 6,
"IS_TITLE": 7,
"IS_UPPER": 8,
"LIKE_URL": 9,
"LIKE_NUM": 10,
"LIKE_EMAIL": 11,
"IS_STOP": 12,
"IS_OOV_DEPRECATED": 13,
"IS_BRACKET": 14,
"IS_QUOTE": 15,
"IS_LEFT_PUNCT": 16,
"IS_RIGHT_PUNCT": 17,
"IS_CURRENCY": 18,
"FLAG19": 19,
"FLAG20": 20,
"FLAG21": 21,
"FLAG22": 22,
"FLAG23": 23,
"FLAG24": 24,
"FLAG25": 25,
"FLAG26": 26,
"FLAG27": 27,
"FLAG28": 28,
"FLAG29": 29,
"FLAG30": 30,
"FLAG31": 31,
"FLAG32": 32,
"FLAG33": 33,
"FLAG34": 34,
"FLAG35": 35,
"FLAG36": 36,
"FLAG37": 37,
"FLAG38": 38,
"FLAG39": 39,
"FLAG40": 40,
"FLAG41": 41,
"FLAG42": 42,
"FLAG43": 43,
"FLAG44": 44,
"FLAG45": 45,
"FLAG46": 46,
"FLAG47": 47,
"FLAG48": 48,
"FLAG49": 49,
"FLAG50": 50,
"FLAG51": 51,
"FLAG52": 52,
"FLAG53": 53,
"FLAG54": 54,
"FLAG55": 55,
"FLAG56": 56,
"FLAG57": 57,
"FLAG58": 58,
"FLAG59": 59,
"FLAG60": 60,
"FLAG61": 61,
"FLAG62": 62,
"FLAG63": 63,
"ID": 64,
"ORTH": 65,
"LOWER": 66,
"NORM": 67,
"SHAPE": 68,
"PREFIX": 69,
"SUFFIX": 70,
"LENGTH": 71,
"CLUSTER": 72,
"LEMMA": 73,
"POS": 74,
"TAG": 75,
"DEP": 76,
"ENT_IOB": 77,
"ENT_TYPE": 78,
"ENT_ID": 454,
"ENT_KB_ID": 452,
"HEAD": 79,
"SENT_START": 80,
"SPACY": 81,
"PROB": 82,
"LANG": 83,
"IDX": 455,
"ADJ": 84,
"ADP": 85,
"ADV": 86,
"AUX": 87,
"CONJ": 88,
"CCONJ": 89,
"DET": 90,
"INTJ": 91,
"NOUN": 92,
"NUM": 93,
"PART": 94,
"PRON": 95,
"PROPN": 96,
"PUNCT": 97,
"SCONJ": 98,
"SYM": 99,
"VERB": 100,
"X": 101,
"EOL": 102,
"SPACE": 103,
"DEPRECATED001": 104,
"DEPRECATED002": 105,
"DEPRECATED003": 106,
"DEPRECATED004": 107,
"DEPRECATED005": 108,
"DEPRECATED006": 109,
"DEPRECATED007": 110,
"DEPRECATED008": 111,
"DEPRECATED009": 112,
"DEPRECATED010": 113,
"DEPRECATED011": 114,
"DEPRECATED012": 115,
"DEPRECATED013": 116,
"DEPRECATED014": 117,
"DEPRECATED015": 118,
"DEPRECATED016": 119,
"DEPRECATED017": 120,
"DEPRECATED018": 121,
"DEPRECATED019": 122,
"DEPRECATED020": 123,
"DEPRECATED021": 124,
"DEPRECATED022": 125,
"DEPRECATED023": 126,
"DEPRECATED024": 127,
"DEPRECATED025": 128,
"DEPRECATED026": 129,
"DEPRECATED027": 130,
"DEPRECATED028": 131,
"DEPRECATED029": 132,
"DEPRECATED030": 133,
"DEPRECATED031": 134,
"DEPRECATED032": 135,
"DEPRECATED033": 136,
"DEPRECATED034": 137,
"DEPRECATED035": 138,
"DEPRECATED036": 139,
"DEPRECATED037": 140,
"DEPRECATED038": 141,
"DEPRECATED039": 142,
"DEPRECATED040": 143,
"DEPRECATED041": 144,
"DEPRECATED042": 145,
"DEPRECATED043": 146,
"DEPRECATED044": 147,
"DEPRECATED045": 148,
"DEPRECATED046": 149,
"DEPRECATED047": 150,
"DEPRECATED048": 151,
"DEPRECATED049": 152,
"DEPRECATED050": 153,
"DEPRECATED051": 154,
"DEPRECATED052": 155,
"DEPRECATED053": 156,
"DEPRECATED054": 157,
"DEPRECATED055": 158,
"DEPRECATED056": 159,
"DEPRECATED057": 160,
"DEPRECATED058": 161,
"DEPRECATED059": 162,
"DEPRECATED060": 163,
"DEPRECATED061": 164,
"DEPRECATED062": 165,
"DEPRECATED063": 166,
"DEPRECATED064": 167,
"DEPRECATED065": 168,
"DEPRECATED066": 169,
"DEPRECATED067": 170,
"DEPRECATED068": 171,
"DEPRECATED069": 172,
"DEPRECATED070": 173,
"DEPRECATED071": 174,
"DEPRECATED072": 175,
"DEPRECATED073": 176,
"DEPRECATED074": 177,
"DEPRECATED075": 178,
"DEPRECATED076": 179,
"DEPRECATED077": 180,
"DEPRECATED078": 181,
"DEPRECATED079": 182,
"DEPRECATED080": 183,
"DEPRECATED081": 184,
"DEPRECATED082": 185,
"DEPRECATED083": 186,
"DEPRECATED084": 187,
"DEPRECATED085": 188,
"DEPRECATED086": 189,
"DEPRECATED087": 190,
"DEPRECATED088": 191,
"DEPRECATED089": 192,
"DEPRECATED090": 193,
"DEPRECATED091": 194,
"DEPRECATED092": 195,
"DEPRECATED093": 196,
"DEPRECATED094": 197,
"DEPRECATED095": 198,
"DEPRECATED096": 199,
"DEPRECATED097": 200,
"DEPRECATED098": 201,
"DEPRECATED099": 202,
"DEPRECATED100": 203,
"DEPRECATED101": 204,
"DEPRECATED102": 205,
"DEPRECATED103": 206,
"DEPRECATED104": 207,
"DEPRECATED105": 208,
"DEPRECATED106": 209,
"DEPRECATED107": 210,
"DEPRECATED108": 211,
"DEPRECATED109": 212,
"DEPRECATED110": 213,
"DEPRECATED111": 214,
"DEPRECATED112": 215,
"DEPRECATED113": 216,
"DEPRECATED114": 217,
"DEPRECATED115": 218,
"DEPRECATED116": 219,
"DEPRECATED117": 220,
"DEPRECATED118": 221,
"DEPRECATED119": 222,
"DEPRECATED120": 223,
"DEPRECATED121": 224,
"DEPRECATED122": 225,
"DEPRECATED123": 226,
"DEPRECATED124": 227,
"DEPRECATED125": 228,
"DEPRECATED126": 229,
"DEPRECATED127": 230,
"DEPRECATED128": 231,
"DEPRECATED129": 232,
"DEPRECATED130": 233,
"DEPRECATED131": 234,
"DEPRECATED132": 235,
"DEPRECATED133": 236,
"DEPRECATED134": 237,
"DEPRECATED135": 238,
"DEPRECATED136": 239,
"DEPRECATED137": 240,
"DEPRECATED138": 241,
"DEPRECATED139": 242,
"DEPRECATED140": 243,
"DEPRECATED141": 244,
"DEPRECATED142": 245,
"DEPRECATED143": 246,
"DEPRECATED144": 247,
"DEPRECATED145": 248,
"DEPRECATED146": 249,
"DEPRECATED147": 250,
"DEPRECATED148": 251,
"DEPRECATED149": 252,
"DEPRECATED150": 253,
"DEPRECATED151": 254,
"DEPRECATED152": 255,
"DEPRECATED153": 256,
"DEPRECATED154": 257,
"DEPRECATED155": 258,
"DEPRECATED156": 259,
"DEPRECATED157": 260,
"DEPRECATED158": 261,
"DEPRECATED159": 262,
"DEPRECATED160": 263,
"DEPRECATED161": 264,
"DEPRECATED162": 265,
"DEPRECATED163": 266,
"DEPRECATED164": 267,
"DEPRECATED165": 268,
"DEPRECATED166": 269,
"DEPRECATED167": 270,
"DEPRECATED168": 271,
"DEPRECATED169": 272,
"DEPRECATED170": 273,
"DEPRECATED171": 274,
"DEPRECATED172": 275,
"DEPRECATED173": 276,
"DEPRECATED174": 277,
"DEPRECATED175": 278,
"DEPRECATED176": 279,
"DEPRECATED177": 280,
"DEPRECATED178": 281,
"DEPRECATED179": 282,
"DEPRECATED180": 283,
"DEPRECATED181": 284,
"DEPRECATED182": 285,
"DEPRECATED183": 286,
"DEPRECATED184": 287,
"DEPRECATED185": 288,
"DEPRECATED186": 289,
"DEPRECATED187": 290,
"DEPRECATED188": 291,
"DEPRECATED189": 292,
"DEPRECATED190": 293,
"DEPRECATED191": 294,
"DEPRECATED192": 295,
"DEPRECATED193": 296,
"DEPRECATED194": 297,
"DEPRECATED195": 298,
"DEPRECATED196": 299,
"DEPRECATED197": 300,
"DEPRECATED198": 301,
"DEPRECATED199": 302,
"DEPRECATED200": 303,
"DEPRECATED201": 304,
"DEPRECATED202": 305,
"DEPRECATED203": 306,
"DEPRECATED204": 307,
"DEPRECATED205": 308,
"DEPRECATED206": 309,
"DEPRECATED207": 310,
"DEPRECATED208": 311,
"DEPRECATED209": 312,
"DEPRECATED210": 313,
"DEPRECATED211": 314,
"DEPRECATED212": 315,
"DEPRECATED213": 316,
"DEPRECATED214": 317,
"DEPRECATED215": 318,
"DEPRECATED216": 319,
"DEPRECATED217": 320,
"DEPRECATED218": 321,
"DEPRECATED219": 322,
"DEPRECATED220": 323,
"DEPRECATED221": 324,
"DEPRECATED222": 325,
"DEPRECATED223": 326,
"DEPRECATED224": 327,
"DEPRECATED225": 328,
"DEPRECATED226": 329,
"DEPRECATED227": 330,
"DEPRECATED228": 331,
"DEPRECATED229": 332,
"DEPRECATED230": 333,
"DEPRECATED231": 334,
"DEPRECATED232": 335,
"DEPRECATED233": 336,
"DEPRECATED234": 337,
"DEPRECATED235": 338,
"DEPRECATED236": 339,
"DEPRECATED237": 340,
"DEPRECATED238": 341,
"DEPRECATED239": 342,
"DEPRECATED240": 343,
"DEPRECATED241": 344,
"DEPRECATED242": 345,
"DEPRECATED243": 346,
"DEPRECATED244": 347,
"DEPRECATED245": 348,
"DEPRECATED246": 349,
"DEPRECATED247": 350,
"DEPRECATED248": 351,
"DEPRECATED249": 352,
"DEPRECATED250": 353,
"DEPRECATED251": 354,
"DEPRECATED252": 355,
"DEPRECATED253": 356,
"DEPRECATED254": 357,
"DEPRECATED255": 358,
"DEPRECATED256": 359,
"DEPRECATED257": 360,
"DEPRECATED258": 361,
"DEPRECATED259": 362,
"DEPRECATED260": 363,
"DEPRECATED261": 364,
"DEPRECATED262": 365,
"DEPRECATED263": 366,
"DEPRECATED264": 367,
"DEPRECATED265": 368,
"DEPRECATED266": 369,
"DEPRECATED267": 370,
"DEPRECATED268": 371,
"DEPRECATED269": 372,
"DEPRECATED270": 373,
"DEPRECATED271": 374,
"DEPRECATED272": 375,
"DEPRECATED273": 376,
"DEPRECATED274": 377,
"DEPRECATED275": 378,
"DEPRECATED276": 379,
"PERSON": 380,
"NORP": 381,
"FACILITY": 382,
"ORG": 383,
"GPE": 384,
"LOC": 385,
"PRODUCT": 386,
"EVENT": 387,
"WORK_OF_ART": 388,
"LANGUAGE": 389,
"DATE": 391,
"TIME": 392,
"PERCENT": 393,
"MONEY": 394,
"QUANTITY": 395,
"ORDINAL": 396,
"CARDINAL": 397,
"acomp": 398,
"advcl": 399,
"advmod": 400,
"agent": 401,
"amod": 402,
"appos": 403,
"attr": 404,
"aux": 405,
"auxpass": 406,
"cc": 407,
"ccomp": 408,
"complm": 409,
"conj": 410,
"cop": 411,
"csubj": 412,
"csubjpass": 413,
"dep": 414,
"det": 415,
"dobj": 416,
"expl": 417,
"hmod": 418,
"hyph": 419,
"infmod": 420,
"intj": 421,
"iobj": 422,
"mark": 423,
"meta": 424,
"neg": 425,
"nmod": 426,
"nn": 427,
"npadvmod": 428,
"nsubj": 429,
"nsubjpass": 430,
"num": 431,
"number": 432,
"oprd": 433,
"obj": 434,
"obl": 435,
"parataxis": 436,
"partmod": 437,
"pcomp": 438,
"pobj": 439,
"poss": 440,
"possessive": 441,
"preconj": 442,
"prep": 443,
"prt": 444,
"punct": 445,
"quantmod": 446,
"rcmod": 448,
"relcl": 447,
"root": 449,
"xcomp": 450,
"acl": 451,
"LAW": 390,
"MORPH": 453,
"_": 456,
}
def test_frozen_symbols():
assert IDS == V3_SYMBOLS
assert NAMES == {v: k for k, v in IDS.items()}