mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Consolidate and freeze symbols

	  Instead of having symbol values defined in three potentially conflicting places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define all symbols in `spacy.symbols` and reference those values in `spacy.attrs` and `spacy.parts_of_speech`.

	  Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

	  Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather than lists in order to support future use of hash values in `attr_id_t`.

	  Minor changes:
	* Use `uint64_t` for attrs in `Doc.to_array` to support future use of hash values
	* Remove unneeded attrs filter for error message in `Doc.to_array`
	* Remove unused attr `SENT_END`
	* Handle dynamic size of `attr_id_t` in `Doc.to_array`
	* Undo added warnings
	* Refactor to make `Doc.to_array` more similar to `Doc.from_array`
	* Improve refactoring
		
			
				
	
	
		
			468 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			468 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import pytest
 | 
						|
from spacy.symbols import IDS, NAMES
 | 
						|
 | 
						|
# Expected symbol name -> integer ID mapping that `spacy.symbols.IDS` is
# compared against in `test_frozen_symbols`.
#
# A few IDs are intentionally out of sequence relative to their neighbours
# (ENT_ID, ENT_KB_ID, IDX, LAW, MORPH, rcmod/relcl); they are reproduced here
# exactly. The two purely mechanical runs (FLAGnn and DEPRECATEDnnn) are
# generated in place so that key order and values match an explicit listing.
V3_SYMBOLS = {
    "": 0,
    # Lexical flags.
    "IS_ALPHA": 1,
    "IS_ASCII": 2,
    "IS_DIGIT": 3,
    "IS_LOWER": 4,
    "IS_PUNCT": 5,
    "IS_SPACE": 6,
    "IS_TITLE": 7,
    "IS_UPPER": 8,
    "LIKE_URL": 9,
    "LIKE_NUM": 10,
    "LIKE_EMAIL": 11,
    "IS_STOP": 12,
    "IS_OOV_DEPRECATED": 13,
    "IS_BRACKET": 14,
    "IS_QUOTE": 15,
    "IS_LEFT_PUNCT": 16,
    "IS_RIGHT_PUNCT": 17,
    "IS_CURRENCY": 18,
    # FLAG19 .. FLAG63: each key carries its own number as the value.
    **{f"FLAG{flag_no}": flag_no for flag_no in range(19, 64)},
    # Token / lexeme attributes.
    "ID": 64,
    "ORTH": 65,
    "LOWER": 66,
    "NORM": 67,
    "SHAPE": 68,
    "PREFIX": 69,
    "SUFFIX": 70,
    "LENGTH": 71,
    "CLUSTER": 72,
    "LEMMA": 73,
    "POS": 74,
    "TAG": 75,
    "DEP": 76,
    "ENT_IOB": 77,
    "ENT_TYPE": 78,
    "ENT_ID": 454,
    "ENT_KB_ID": 452,
    "HEAD": 79,
    "SENT_START": 80,
    "SPACY": 81,
    "PROB": 82,
    "LANG": 83,
    "IDX": 455,
    # Part-of-speech symbols.
    "ADJ": 84,
    "ADP": 85,
    "ADV": 86,
    "AUX": 87,
    "CONJ": 88,
    "CCONJ": 89,
    "DET": 90,
    "INTJ": 91,
    "NOUN": 92,
    "NUM": 93,
    "PART": 94,
    "PRON": 95,
    "PROPN": 96,
    "PUNCT": 97,
    "SCONJ": 98,
    "SYM": 99,
    "VERB": 100,
    "X": 101,
    "EOL": 102,
    "SPACE": 103,
    # DEPRECATED001 .. DEPRECATED276 occupy IDs 104 .. 379 (ID = number + 103).
    **{f"DEPRECATED{slot:03d}": slot + 103 for slot in range(1, 277)},
    # Entity labels (LAW = 390 is listed further down, after "acl").
    "PERSON": 380,
    "NORP": 381,
    "FACILITY": 382,
    "ORG": 383,
    "GPE": 384,
    "LOC": 385,
    "PRODUCT": 386,
    "EVENT": 387,
    "WORK_OF_ART": 388,
    "LANGUAGE": 389,
    "DATE": 391,
    "TIME": 392,
    "PERCENT": 393,
    "MONEY": 394,
    "QUANTITY": 395,
    "ORDINAL": 396,
    "CARDINAL": 397,
    # Dependency labels.
    "acomp": 398,
    "advcl": 399,
    "advmod": 400,
    "agent": 401,
    "amod": 402,
    "appos": 403,
    "attr": 404,
    "aux": 405,
    "auxpass": 406,
    "cc": 407,
    "ccomp": 408,
    "complm": 409,
    "conj": 410,
    "cop": 411,
    "csubj": 412,
    "csubjpass": 413,
    "dep": 414,
    "det": 415,
    "dobj": 416,
    "expl": 417,
    "hmod": 418,
    "hyph": 419,
    "infmod": 420,
    "intj": 421,
    "iobj": 422,
    "mark": 423,
    "meta": 424,
    "neg": 425,
    "nmod": 426,
    "nn": 427,
    "npadvmod": 428,
    "nsubj": 429,
    "nsubjpass": 430,
    "num": 431,
    "number": 432,
    "oprd": 433,
    "obj": 434,
    "obl": 435,
    "parataxis": 436,
    "partmod": 437,
    "pcomp": 438,
    "pobj": 439,
    "poss": 440,
    "possessive": 441,
    "preconj": 442,
    "prep": 443,
    "prt": 444,
    "punct": 445,
    "quantmod": 446,
    "rcmod": 448,
    "relcl": 447,
    "root": 449,
    "xcomp": 450,
    "acl": 451,
    "LAW": 390,
    "MORPH": 453,
    "_": 456,
}
 | 
						|
 | 
						|
 | 
						|
def test_frozen_symbols():
    """Check the symbol tables against the frozen `V3_SYMBOLS` snapshot.

    `IDS` must be equal to `V3_SYMBOLS`, and `NAMES` must be the
    ID -> name inversion of `IDS`.
    """
    assert IDS == V3_SYMBOLS

    # Build the expected reverse mapping explicitly from IDS.
    expected_names = {}
    for symbol_name, symbol_id in IDS.items():
        expected_names[symbol_id] = symbol_name
    assert NAMES == expected_names
 |