spaCy/spacy/wsd/supersenses.pyx
2015-07-06 09:33:26 +02:00

70 lines
1.7 KiB
Cython

from __future__ import unicode_literals
from .. cimport parts_of_speech
# Table of sense categories, one row per line with three whitespace-separated
# columns. Only the middle column (the name) is consumed below.
# NOTE(review): the layout looks like WordNet's lexnames file (file number,
# name, part-of-speech code) with abbreviated prefixes -- confirm.
# NOTE(review): rows 01 and 44 share column-three value 3 with J_all yet use
# the 'A_' prefix like A_all (4); encode_supersense_strs rewrites 'adj' to
# 'J', so confirm these two names are what callers actually pass.
lexnames_str = """
-1 NO_SENSE -1
00 J_all 3
01 A_pert 3
02 A_all 4
03 N_Tops 1
04 N_act 1
05 N_animal 1
06 N_artifact 1
07 N_attribute 1
08 N_body 1
09 N_cognition 1
10 N_communication 1
11 N_event 1
12 N_feeling 1
13 N_food 1
14 N_group 1
15 N_location 1
16 N_motive 1
17 N_object 1
18 N_person 1
19 N_phenomenon 1
20 N_plant 1
21 N_possession 1
22 N_process 1
23 N_quantity 1
24 N_relation 1
25 N_shape 1
26 N_state 1
27 N_substance 1
28 N_time 1
29 V_body 2
30 V_change 2
31 V_cognition 2
32 V_communication 2
33 V_competition 2
34 V_consumption 2
35 V_contact 2
36 V_creation 2
37 V_emotion 2
38 V_motion 2
39 V_perception 2
40 V_possession 2
41 V_social 2
42 V_stative 2
43 V_weather 2
44 A_ppl 3
""".strip()

# Sense names in table order; a name's position in this tuple is its ID.
STRINGS = tuple(row.split()[1] for row in lexnames_str.splitlines())

# Reverse lookup from sense name to its position in STRINGS. Positions start
# at 0, so NO_SENSE maps to 0 rather than the -1 shown in the first column.
IDS = {name: position for position, name in enumerate(STRINGS)}
cdef flags_t encode_supersense_strs(sense_names) except 0:
    """Pack a collection of supersense names into a single bit field.

    Each name has its part-of-speech prefix abbreviated ('noun' -> 'N',
    'verb' -> 'V', 'adj' -> 'J', 'adv' -> 'A'), is looked up in IDS, and the
    bit at that index is set in the result.

    sense_names: a sized iterable of sense-name strings (len() is called).

    Returns the OR of one bit per listed sense. An empty input yields only
    the NO_SENSE bit. Raises KeyError when a normalised name is not in IDS.
    The value 0 is reserved by the `except 0` clause to signal an exception.
    """
    # A bare literal 1 is a C int, so `1 << n` is evaluated at int width and
    # overflows for n >= 31, while IDS indices run up to 45. Shifting a
    # flags_t-typed one keeps the whole expression at the width of flags_t.
    # NOTE(review): flags_t is declared outside this file's visible code;
    # presumably a 64-bit unsigned type -- confirm.
    cdef flags_t one = 1
    cdef flags_t sense_bits = 0
    cdef flags_t sense_id = 0
    if len(sense_names) == 0:
        return one << NO_SENSE
    for sense_str in sense_names:
        sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
        sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
        sense_id = IDS[sense_str]
        sense_bits |= one << sense_id
    return sense_bits