From 88a4e53fcbc452a7a0b093143abe4fd54eb1a407 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 6 Jul 2015 09:01:21 +0200 Subject: [PATCH] * Begin refactoring sense tagger --- spacy/munge/read_semcor.py | 193 ++++++++++++++++++ spacy/wsd/__init__.pxd | 0 spacy/wsd/__init__.py | 0 .../supersense_tagger.pyx} | 0 spacy/{senses.pxd => wsd/supersenses.pxd} | 0 spacy/{senses.pyx => wsd/supersenses.pyx} | 0 6 files changed, 193 insertions(+) create mode 100644 spacy/munge/read_semcor.py create mode 100644 spacy/wsd/__init__.pxd create mode 100644 spacy/wsd/__init__.py rename spacy/{sense_tagger.pyx => wsd/supersense_tagger.pyx} (100%) rename spacy/{senses.pxd => wsd/supersenses.pxd} (100%) rename spacy/{senses.pyx => wsd/supersenses.pyx} (100%) diff --git a/spacy/munge/read_semcor.py b/spacy/munge/read_semcor.py new file mode 100644 index 000000000..c8dd5dfd6 --- /dev/null +++ b/spacy/munge/read_semcor.py @@ -0,0 +1,193 @@ +from __future__ import unicode_literals +from __future__ import division +import plac +import re +from os import path +import os +import codecs + +from spacy.en import English + +lexnames_str = """ +-1 NO_SENSE -1 +00 J_all 3 +01 A_pert 3 +02 A_all 4 +03 N_Tops 1 +04 N_act 1 +05 N_animal 1 +06 N_artifact 1 +07 N_attribute 1 +08 N_body 1 +09 N_cognition 1 +10 N_communication 1 +11 N_event 1 +12 N_feeling 1 +13 N_food 1 +14 N_group 1 +15 N_location 1 +16 N_motive 1 +17 N_object 1 +18 N_person 1 +19 N_phenomenon 1 +20 N_plant 1 +21 N_possession 1 +22 N_process 1 +23 N_quantity 1 +24 N_relation 1 +25 N_shape 1 +26 N_state 1 +27 N_substance 1 +28 N_time 1 +29 V_body 2 +30 V_change 2 +31 V_cognition 2 +32 V_communication 2 +33 V_competition 2 +34 V_consumption 2 +35 V_contact 2 +36 V_creation 2 +37 V_emotion 2 +38 V_motion 2 +39 V_perception 2 +40 V_possession 2 +41 V_social 2 +42 V_stative 2 +43 V_weather 2 +44 A_ppl 3 +""".strip() + +SUPERSENSES = tuple(line.split()[1] for line in lexnames_str.split('\n')) + + + + +def re_get(exp, string): + obj = exp.search(string) + if obj is None: + return obj + else: + return obj.group() + + +lemma_re = re.compile(r'(?<=lemma=)[^ >]+') +cmd_re = re.compile(r'(?<=cmd=)[^ >]+') +pos_re = re.compile(r'(?<=pos=)[^ >]+') +ot_re = re.compile(r'(?<=ot=)[^ >]+') +wnsn_re = re.compile(r'(?<=wnsn=)[^ >]+') +lexsn_re = re.compile(r'(?<=lexsn=)[^ >]+') +supersense_re = re.compile(r'(?<=lexsn=\d:)\d\d') +orth_re = re.compile(r'(?<=>)[^<]+(?=<)') +class Token(object): + def __init__(self, line): + self.cmd = re_get(cmd_re, line) + self.lemma = re_get(lemma_re, line) + self.ot = re_get(ot_re, line) + self.pos = re_get(pos_re, line) + self.wnsn = re_get(wnsn_re, line) + self.lexsn = re_get(lexsn_re, line) + supersense = re_get(supersense_re, line) + if supersense is None: + self.supersense = SUPERSENSES[0] + else: + self.supersense = SUPERSENSES[int(supersense) + 1] + self.orth = re_get(orth_re, line) + + def __str__(self): + return (self.cmd, self.lemma, self.ot, self.pos, + self.wnsn, self.lexsn, self.orth) + + def __repr__(self): + return str(self) + + +def read_file(loc): + paras = [] + sents = [] + sent = [] + filename = None + pnum = None + snum = None + for line in codecs.open(loc, 'r', 'latin1'): + line = line.strip() + if not line: + continue + + if line.startswith('contextfile'): + continue + + if line.startswith('': + sents.append((snum, sent)) + sent = [] + snum = None + continue + + if line == '

': + paras.append((pnum, sents)) + sents = [] + pnum = None + continue + return paras + + +def read_semcor(semcor_dir): + docs = [] + brown1 = path.join(semcor_dir, 'brown1', 'tagfiles') + for filename in os.listdir(brown1): + file_path = path.join(brown1, filename) + docs.append((filename, read_file(file_path))) + return docs + + +def test_token(): + string = 'sheriff' + token = Token(string) + assert token.cmd == 'done' + assert token.pos == 'NN' + assert token.lemma == 'sheriff' + assert token.wnsn == '1' + assert token.lexsn == '1:18:00::' + assert token.orth == 'sheriff' + + +def main(model_dir, semcor_dir): + brown1 = path.join(semcor_dir, 'brown1', 'tagfiles') + + nlp = English(data_dir=model_dir) + total_right = 0 + total_wrong = 0 + total_multi = 0 + for filename in os.listdir(brown1): + file_path = path.join(brown1, filename) + annotations = read_file(file_path) + + n_multi, n_right, n_wrong = eval_text(nlp, annotations) + total_right += n_right + total_wrong += n_wrong + total_multi += n_multi + print total_right, total_wrong + print total_right / (total_right + total_wrong) + print total_multi / (total_multi + total_right + total_wrong) + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/wsd/__init__.pxd b/spacy/wsd/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/wsd/__init__.py b/spacy/wsd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/sense_tagger.pyx b/spacy/wsd/supersense_tagger.pyx similarity index 100% rename from spacy/sense_tagger.pyx rename to spacy/wsd/supersense_tagger.pyx diff --git a/spacy/senses.pxd b/spacy/wsd/supersenses.pxd similarity index 100% rename from spacy/senses.pxd rename to spacy/wsd/supersenses.pxd diff --git a/spacy/senses.pyx b/spacy/wsd/supersenses.pyx similarity index 100% rename from spacy/senses.pyx rename to spacy/wsd/supersenses.pyx