From 88a4e53fcbc452a7a0b093143abe4fd54eb1a407 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 6 Jul 2015 09:01:21 +0200
Subject: [PATCH] * Begin refactoring sense tagger
---
spacy/munge/read_semcor.py | 193 ++++++++++++++++++
spacy/wsd/__init__.pxd | 0
spacy/wsd/__init__.py | 0
.../supersense_tagger.pyx} | 0
spacy/{senses.pxd => wsd/supersenses.pxd} | 0
spacy/{senses.pyx => wsd/supersenses.pyx} | 0
6 files changed, 193 insertions(+)
create mode 100644 spacy/munge/read_semcor.py
create mode 100644 spacy/wsd/__init__.pxd
create mode 100644 spacy/wsd/__init__.py
rename spacy/{sense_tagger.pyx => wsd/supersense_tagger.pyx} (100%)
rename spacy/{senses.pxd => wsd/supersenses.pxd} (100%)
rename spacy/{senses.pyx => wsd/supersenses.pyx} (100%)
diff --git a/spacy/munge/read_semcor.py b/spacy/munge/read_semcor.py
new file mode 100644
index 000000000..c8dd5dfd6
--- /dev/null
+++ b/spacy/munge/read_semcor.py
@@ -0,0 +1,193 @@
+from __future__ import unicode_literals
+from __future__ import division
+import plac
+import re
+from os import path
+import os
+import codecs
+
+from spacy.en import English
+
+# Supersense inventory, one entry per line: an id, the supersense name, and
+# a numeric code. After the leading NO_SENSE entry the ids run 00-44 in order.
+# NOTE(review): the ids and the third column look like WordNet's lexnames
+# table (lexicographer file number and syntactic category) -- confirm.
+lexnames_str = """
+-1 NO_SENSE -1
+00 J_all 3
+01 A_pert 3
+02 A_all 4
+03 N_Tops 1
+04 N_act 1
+05 N_animal 1
+06 N_artifact 1
+07 N_attribute 1
+08 N_body 1
+09 N_cognition 1
+10 N_communication 1
+11 N_event 1
+12 N_feeling 1
+13 N_food 1
+14 N_group 1
+15 N_location 1
+16 N_motive 1
+17 N_object 1
+18 N_person 1
+19 N_phenomenon 1
+20 N_plant 1
+21 N_possession 1
+22 N_process 1
+23 N_quantity 1
+24 N_relation 1
+25 N_shape 1
+26 N_state 1
+27 N_substance 1
+28 N_time 1
+29 V_body 2
+30 V_change 2
+31 V_cognition 2
+32 V_communication 2
+33 V_competition 2
+34 V_consumption 2
+35 V_contact 2
+36 V_creation 2
+37 V_emotion 2
+38 V_motion 2
+39 V_perception 2
+40 V_possession 2
+41 V_social 2
+42 V_stative 2
+43 V_weather 2
+44 A_ppl 3
+""".strip()
+
+# The names only (second column), in table order. NO_SENSE sits at index 0,
+# so the entry whose id is k is found at index k + 1.
+SUPERSENSES = tuple(line.split()[1] for line in lexnames_str.split('\n'))
+
+
+
+
+def re_get(exp, string):
+    """Return the text of the first match of `exp` in `string`.
+
+    `exp` is a compiled pattern. The result is None when the pattern does
+    not occur anywhere in `string`.
+    """
+    match = exp.search(string)
+    return match.group() if match is not None else None
+
+
+# Field extractors applied to one tagged line. Each `name=` pattern uses a
+# lookbehind, so the match is only the value: the run of characters up to
+# the next space or '>'.
+lemma_re = re.compile(r'(?<=lemma=)[^ >]+')
+cmd_re = re.compile(r'(?<=cmd=)[^ >]+')
+pos_re = re.compile(r'(?<=pos=)[^ >]+')
+ot_re = re.compile(r'(?<=ot=)[^ >]+')
+wnsn_re = re.compile(r'(?<=wnsn=)[^ >]+')
+lexsn_re = re.compile(r'(?<=lexsn=)[^ >]+')
+# The two digits that follow `lexsn=`, one digit and a colon.
+supersense_re = re.compile(r'(?<=lexsn=\d:)\d\d')
+# The text between a '>' and the next '<'.
+orth_re = re.compile(r'(?<=>)[^<]+(?=<)')
+class Token(object):
+    """One annotated token parsed from a single tagged line.
+
+    The attributes `cmd`, `lemma`, `ot`, `pos`, `wnsn` and `lexsn` hold the
+    value of the same-named `name=value` field on the line, or None when
+    the field is absent. `orth` is the text found between a '>' and the
+    following '<', or None. `supersense` is always a name taken from
+    SUPERSENSES.
+    """
+    def __init__(self, line):
+        self.cmd = re_get(cmd_re, line)
+        self.lemma = re_get(lemma_re, line)
+        self.ot = re_get(ot_re, line)
+        self.pos = re_get(pos_re, line)
+        self.wnsn = re_get(wnsn_re, line)
+        self.lexsn = re_get(lexsn_re, line)
+        supersense = re_get(supersense_re, line)
+        if supersense is None:
+            # No sense key on the line: use the first table entry.
+            self.supersense = SUPERSENSES[0]
+        else:
+            # The table has one extra entry in front, so the two-digit
+            # number is shifted by one to index it.
+            self.supersense = SUPERSENSES[int(supersense) + 1]
+        self.orth = re_get(orth_re, line)
+
+    def __str__(self):
+        # __str__ must return a string; handing back the bare tuple makes
+        # both str() and repr() raise TypeError.
+        return repr((self.cmd, self.lemma, self.ot, self.pos,
+                     self.wnsn, self.lexsn, self.orth))
+
+    def __repr__(self):
+        return str(self)
+
+
+def read_file(loc):
+ """Collect the sentences and paragraphs of the tagged file at `loc`.
+
+ The file is decoded as latin1 and scanned one stripped line at a time;
+ empty lines and lines starting with 'contextfile' are skipped. Returns
+ `paras`, a list of (pnum, sents) pairs in which each `sents` is a list
+ of (snum, sent) pairs.
+
+ NOTE(review): the visible text of this function looks damaged. Two of
+ the string literals compared against `line` appear to have lost their
+ contents, and nothing shown here assigns `filename`, `pnum` or `snum`
+ or appends to `sent`. Presumably those statements were lost together
+ with the literals -- verify against the original file before use.
+ """
+ paras = []
+ sents = []
+ sent = []
+ filename = None
+ pnum = None
+ snum = None
+ # NOTE(review): the handle returned by codecs.open is never closed here.
+ for line in codecs.open(loc, 'r', 'latin1'):
+ line = line.strip()
+ if not line:
+ continue
+
+ if line.startswith('contextfile'):
+ continue
+
+ # NOTE(review): this condition is not well-formed as shown (the call is
+ # never closed). The statements under it finish a sentence: store it with
+ # its number, then reset both.
+ if line.startswith('':
+ sents.append((snum, sent))
+ sent = []
+ snum = None
+ continue
+
+ # NOTE(review): the literal below is split across two lines as shown. The
+ # statements under it finish a paragraph: store its sentences with the
+ # paragraph number, then reset both.
+ if line == '
':
+ paras.append((pnum, sents))
+ sents = []
+ pnum = None
+ continue
+ return paras
+
+
+def read_semcor(semcor_dir):
+    """Read every file found in <semcor_dir>/brown1/tagfiles.
+
+    Returns a list of (filename, paragraphs) pairs, one per directory
+    entry, in the order os.listdir yields them. `paragraphs` is whatever
+    read_file returns for that file.
+    """
+    tag_dir = path.join(semcor_dir, 'brown1', 'tagfiles')
+    return [(name, read_file(path.join(tag_dir, name)))
+            for name in os.listdir(tag_dir)]
+
+
+def test_token():
+    """Check that Token pulls each field out of one sample tagged line."""
+    # The sample has to carry the markup itself: Token reads the name=value
+    # fields from inside the opening tag and the surface form from between
+    # the tags. A bare word would leave every field as None.
+    string = '<wf cmd=done pos=NN lemma=sheriff wnsn=1 lexsn=1:18:00::>sheriff</wf>'
+    token = Token(string)
+    assert token.cmd == 'done'
+    assert token.pos == 'NN'
+    assert token.lemma == 'sheriff'
+    assert token.wnsn == '1'
+    assert token.lexsn == '1:18:00::'
+    assert token.orth == 'sheriff'
+
+
+def main(model_dir, semcor_dir):
+ """Evaluate on the files in <semcor_dir>/brown1/tagfiles and print totals.
+
+ Builds English(data_dir=model_dir), reads each directory entry with
+ read_file, and passes the result to eval_text, whose three return values
+ (n_multi, n_right, n_wrong) are added to running totals. Prints the
+ right and wrong totals, then right / (right + wrong), then
+ multi / (multi + right + wrong).
+
+ NOTE(review): eval_text is not defined in the visible part of this
+ file -- confirm where it is meant to come from.
+ """
+ brown1 = path.join(semcor_dir, 'brown1', 'tagfiles')
+
+ nlp = English(data_dir=model_dir)
+ total_right = 0
+ total_wrong = 0
+ total_multi = 0
+ for filename in os.listdir(brown1):
+ file_path = path.join(brown1, filename)
+ annotations = read_file(file_path)
+
+ n_multi, n_right, n_wrong = eval_text(nlp, annotations)
+ total_right += n_right
+ total_wrong += n_wrong
+ total_multi += n_multi
+ # Python 2 print statements. The module imports division from __future__,
+ # so the two ratios below are true divisions; both raise
+ # ZeroDivisionError while their totals are still zero.
+ # NOTE(review): the indentation shown does not reveal whether these prints
+ # run once per file or once after the loop -- confirm.
+ print total_right, total_wrong
+ print total_right / (total_right + total_wrong)
+ print total_multi / (total_multi + total_right + total_wrong)
+
+# Script entry point: when the file is executed directly, hand main to
+# plac.call.
+if __name__ == '__main__':
+ plac.call(main)
diff --git a/spacy/wsd/__init__.pxd b/spacy/wsd/__init__.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/wsd/__init__.py b/spacy/wsd/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/sense_tagger.pyx b/spacy/wsd/supersense_tagger.pyx
similarity index 100%
rename from spacy/sense_tagger.pyx
rename to spacy/wsd/supersense_tagger.pyx
diff --git a/spacy/senses.pxd b/spacy/wsd/supersenses.pxd
similarity index 100%
rename from spacy/senses.pxd
rename to spacy/wsd/supersenses.pxd
diff --git a/spacy/senses.pyx b/spacy/wsd/supersenses.pyx
similarity index 100%
rename from spacy/senses.pyx
rename to spacy/wsd/supersenses.pyx