mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
4a615cacd2
* Consolidate and freeze symbols

  Instead of having symbol values defined in three potentially conflicting places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define all symbols in `spacy.symbols` and reference those values in `spacy.attrs` and `spacy.parts_of_speech`.

  Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

  Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather than lists in order to support future use of hash values in `attr_id_t`.

  Minor changes:

  * Use `uint64_t` for attrs in `Doc.to_array` to support future use of hash values
  * Remove unneeded attrs filter for error message in `Doc.to_array`
  * Remove unused attr `SENT_END`
  * Handle dynamic size of attr_id_t in Doc.to_array
  * Undo added warnings
  * Refactor to make Doc.to_array more similar to Doc.from_array
  * Improve refactoring
468 lines
10 KiB
Python
468 lines
10 KiB
Python
import pytest
|
|
from spacy.symbols import IDS, NAMES
|
|
|
|
# Expected symbol-name -> integer-ID table that `test_frozen_symbols` compares
# against `spacy.symbols.IDS`. The IDs cover 0..456 exactly once each.
#
# Two long, purely mechanical runs are generated inline instead of being
# spelled out entry by entry:
#   * FLAG19 .. FLAG63               -> ID equals the numeric suffix
#   * DEPRECATED001 .. DEPRECATED276 -> ID equals the numeric suffix + 103
# Everything else is listed explicitly, in the original order. A few names
# (ENT_ID, ENT_KB_ID, IDX, rcmod/relcl, LAW, MORPH, "_") carry IDs that are out
# of sequence with their position; those values are intentional parts of the
# snapshot and must not be "tidied".
V3_SYMBOLS = {
    "": 0,
    "IS_ALPHA": 1,
    "IS_ASCII": 2,
    "IS_DIGIT": 3,
    "IS_LOWER": 4,
    "IS_PUNCT": 5,
    "IS_SPACE": 6,
    "IS_TITLE": 7,
    "IS_UPPER": 8,
    "LIKE_URL": 9,
    "LIKE_NUM": 10,
    "LIKE_EMAIL": 11,
    "IS_STOP": 12,
    "IS_OOV_DEPRECATED": 13,
    "IS_BRACKET": 14,
    "IS_QUOTE": 15,
    "IS_LEFT_PUNCT": 16,
    "IS_RIGHT_PUNCT": 17,
    "IS_CURRENCY": 18,
    # FLAG19 .. FLAG63 (45 entries), each mapped to its own number.
    **{f"FLAG{slot}": slot for slot in range(19, 64)},
    "ID": 64,
    "ORTH": 65,
    "LOWER": 66,
    "NORM": 67,
    "SHAPE": 68,
    "PREFIX": 69,
    "SUFFIX": 70,
    "LENGTH": 71,
    "CLUSTER": 72,
    "LEMMA": 73,
    "POS": 74,
    "TAG": 75,
    "DEP": 76,
    "ENT_IOB": 77,
    "ENT_TYPE": 78,
    "ENT_ID": 454,
    "ENT_KB_ID": 452,
    "HEAD": 79,
    "SENT_START": 80,
    "SPACY": 81,
    "PROB": 82,
    "LANG": 83,
    "IDX": 455,
    "ADJ": 84,
    "ADP": 85,
    "ADV": 86,
    "AUX": 87,
    "CONJ": 88,
    "CCONJ": 89,
    "DET": 90,
    "INTJ": 91,
    "NOUN": 92,
    "NUM": 93,
    "PART": 94,
    "PRON": 95,
    "PROPN": 96,
    "PUNCT": 97,
    "SCONJ": 98,
    "SYM": 99,
    "VERB": 100,
    "X": 101,
    "EOL": 102,
    "SPACE": 103,
    # DEPRECATED001 .. DEPRECATED276 (zero-padded to three digits) fill the
    # contiguous ID range 104 .. 379.
    **{f"DEPRECATED{seq:03d}": seq + 103 for seq in range(1, 277)},
    "PERSON": 380,
    "NORP": 381,
    "FACILITY": 382,
    "ORG": 383,
    "GPE": 384,
    "LOC": 385,
    "PRODUCT": 386,
    "EVENT": 387,
    "WORK_OF_ART": 388,
    "LANGUAGE": 389,
    "DATE": 391,
    "TIME": 392,
    "PERCENT": 393,
    "MONEY": 394,
    "QUANTITY": 395,
    "ORDINAL": 396,
    "CARDINAL": 397,
    "acomp": 398,
    "advcl": 399,
    "advmod": 400,
    "agent": 401,
    "amod": 402,
    "appos": 403,
    "attr": 404,
    "aux": 405,
    "auxpass": 406,
    "cc": 407,
    "ccomp": 408,
    "complm": 409,
    "conj": 410,
    "cop": 411,
    "csubj": 412,
    "csubjpass": 413,
    "dep": 414,
    "det": 415,
    "dobj": 416,
    "expl": 417,
    "hmod": 418,
    "hyph": 419,
    "infmod": 420,
    "intj": 421,
    "iobj": 422,
    "mark": 423,
    "meta": 424,
    "neg": 425,
    "nmod": 426,
    "nn": 427,
    "npadvmod": 428,
    "nsubj": 429,
    "nsubjpass": 430,
    "num": 431,
    "number": 432,
    "oprd": 433,
    "obj": 434,
    "obl": 435,
    "parataxis": 436,
    "partmod": 437,
    "pcomp": 438,
    "pobj": 439,
    "poss": 440,
    "possessive": 441,
    "preconj": 442,
    "prep": 443,
    "prt": 444,
    "punct": 445,
    "quantmod": 446,
    "rcmod": 448,
    "relcl": 447,
    "root": 449,
    "xcomp": 450,
    "acl": 451,
    "LAW": 390,
    "MORPH": 453,
    "_": 456,
}
|
|
|
|
|
|
def test_frozen_symbols():
    """Check `spacy.symbols` against the frozen `V3_SYMBOLS` snapshot.

    `IDS` must equal the snapshot exactly, and `NAMES` must be the exact
    value -> name inverse of `IDS`.
    """
    # Forward table: any added, removed or renumbered symbol fails here.
    assert IDS == V3_SYMBOLS

    # Reverse table: rebuild the inverse of IDS by hand and compare.
    expected_names = {}
    for symbol_name, symbol_id in IDS.items():
        expected_names[symbol_id] = symbol_name
    assert NAMES == expected_names
|