# spaCy/spacy/wsd/supersenses.pyx

from __future__ import unicode_literals
from .. cimport parts_of_speech


# Supersense table, one row per line, whitespace-separated into three fields.
# Only the middle field (the name) is read below; the first and third fields
# are never parsed by this module (presumably the WordNet lexicographer file
# number and a part-of-speech code -- TODO confirm against the data source).
#
# NOTE(review): 'A_pert' and 'A_ppl' carry the same third-field value (3) as
# 'J_all', while 'A_all' carries 4. encode_supersense_strs rewrites 'adj' to
# 'J' and 'adv' to 'A', so a normalised 'adj' name for those two rows would
# not be found in IDS. Confirm whether these should be 'J_pert' / 'J_ppl'.
lexnames_str = """
-1      NO_SENSE       -1
00      J_all 3
01      A_pert        3
02      A_all 4
03      N_Tops       1
04      N_act        1
05      N_animal     1
06      N_artifact   1
07      N_attribute  1
08      N_body       1
09      N_cognition  1
10      N_communication      1
11      N_event      1
12      N_feeling    1
13      N_food       1
14      N_group      1
15      N_location   1
16      N_motive     1
17      N_object     1
18      N_person     1
19      N_phenomenon 1
20      N_plant      1
21      N_possession 1
22      N_process    1
23      N_quantity   1
24      N_relation   1
25      N_shape      1
26      N_state      1
27      N_substance  1
28      N_time       1
29      V_body       2
30      V_change     2
31      V_cognition  2
32      V_communication      2
33      V_competition        2
34      V_consumption        2
35      V_contact    2
36      V_creation   2
37      V_emotion    2
38      V_motion     2
39      V_perception 2
40      V_possession 2
41      V_social     2
42      V_stative    2
43      V_weather    2
44      A_ppl 3
""".strip()

# Sense names in table order; a name's position here is its integer id, so
# 'NO_SENSE' (the first row) is id 0.
STRINGS = tuple(
    fields[1]
    for fields in (row.split() for row in lexnames_str.split('\n'))
)

# Reverse lookup: sense name -> position in STRINGS.
IDS = dict(zip(STRINGS, range(len(STRINGS))))


cdef flags_t encode_supersense_strs(sense_names) except 0:
    """Pack a collection of supersense names into a bit mask.

    Each name is normalised by replacing the substrings 'noun', 'verb',
    'adj' and 'adv' with 'N', 'V', 'J' and 'A' respectively, looked up in
    IDS, and the bit at that index is set in the result. An empty
    collection yields a mask with only the NO_SENSE bit set.

    Every successful path sets at least one bit, so the declared error
    value 0 never collides with a real result.

    Raises:
        KeyError: if a normalised name is not a key of IDS.
    """
    # Shift a flags_t-typed one rather than a bare integer literal, so the
    # shift is carried out at the width of flags_t. The table above assigns
    # ids up to 45, which do not fit in a 32-bit int. flags_t is declared
    # outside this file; it needs at least 46 bits -- TODO confirm.
    cdef flags_t one = 1
    cdef flags_t sense_bits = 0
    cdef flags_t sense_id = 0
    if len(sense_names) == 0:
        return one << NO_SENSE
    for sense_str in sense_names:
        sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
        sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
        sense_id = IDS[sense_str]
        sense_bits |= one << sense_id
    return sense_bits