diff --git a/services/displacy.py b/services/displacy.py
deleted file mode 100644
index 40451daf2..000000000
--- a/services/displacy.py
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/usr/bin/env python
-from __future__ import unicode_literals
-from __future__ import print_function
-import sys
-
-import falcon
-import json
-from os import path
-from collections import defaultdict
-import pprint
-import numpy
-
-import spacy.en
-from spacy.attrs import ORTH, SPACY, TAG, POS, ENT_IOB, ENT_TYPE
-from spacy.parts_of_speech import NAMES as UNIV_POS_NAMES
-
-try:
-    unicode
-except NameError:
-    unicode = str
-
-
-# Load the English pipeline once at import time; all requests share it.
-NLU = spacy.en.English()
-
-
-def merge_entities(doc):
-    ents = [(e[0].idx, e[len(e)-1].idx + len(e[len(e)-1]), e.label_, e.text)
-            for e in doc.ents if len(e) >= 2]
-    for start, end, label, lemma in ents:
-        merged = doc.merge(start, end, label, lemma, label)
-        assert merged is not None
-
-
-def merge_nps(doc):
-    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.root.tag_, np.text)
-           for np in doc.noun_chunks if len(np) >= 2]
-
-    for start, end, ent_type, lemma in nps:
-        doc.merge(start, end, u'NP', lemma, ent_type)
-
-
-def merge_punct(tokens):
-    # Merge punctuation onto its head
-    collect = False
-    start = None
-    merges = []
-
-    for word in tokens:
-        if word.whitespace_:
-            if collect:
-                span = tokens[start:word.i+1]
-                if len(span) >= 2:
-                    merges.append((
-                        span[0].idx,
-                        span[-1].idx + len(span[-1]),
-                        span.root.tag_,
-                        span.root.lemma_,
-                        span.root.ent_type_))
-                collect = False
-                start = None
-        elif not collect:
-            collect = True
-            start = word.i
-    if collect:
-        span = tokens[start:len(tokens)]
-        merges.append((span[0].idx, span[-1].idx + len(span[-1]),
-                       span.root.tag_, span.root.lemma_, span.root.ent_type_))
-    for merge in merges:
-        tokens.merge(*merge)
-
-
-def get_actions(parse_state, n_actions):
-    # 'binding' values are browser keyCodes (arrow keys, space, backspace).
-    actions = []
-    actions.append({'label': 'shift', 'key': 'S', 'binding': 38,
-                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'S')})
-    actions.append({'label': 'left', 'key': 'L', 'binding': 37,
-                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'L-det')})
-    actions.append({'label': 'predict', 'key': '_', 'binding': 32,
-                    'is_valid': bool(parse_state.queue or parse_state.stack)})
-    actions.append({'label': 'right', 'key': 'R', 'binding': 39,
-                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'R-dobj')})
-    actions.append({'label': 'undo', 'key': '-', 'binding': 8,
-                    'is_valid': n_actions != 0})
-    actions.append({'label': 'reduce', 'key': 'D', 'binding': 40,
-                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'D')})
-    return actions
-
-
-class Model(object):
-    def to_json(self):
-        return {name: _as_json(value) for name, value in self.__dict__.items()
-                if not name.startswith('_')}
-
-def _as_json(value):
-    if hasattr(value, 'to_json'):
-        return value.to_json()
-    elif isinstance(value, list):
-        return [_as_json(v) for v in value]
-    elif isinstance(value, set):
-        return {key: True for key in value}
-    else:
-        return value
-
-
-def _parse_history(history):
-    if history and history.endswith(','):
-        history = history[:-1]
-    history = history.strip().split(',') if history else tuple()
-    new_hist = []
-    history_length = len(history)
-    # A '-' entry in the comma-separated history undoes the previous action.
-    for action in history:
-        if action == '-':
-            if new_hist:
-                new_hist.pop()
-        else:
-            new_hist.append(action)
-    return new_hist, history_length
-
-
-def apply_edits(tokens, word_edits, tag_edits):
-    new_words = []
-    attrs = (POS, ENT_TYPE, ENT_IOB)
-    new_analysis = numpy.zeros(shape=(len(tokens), len(attrs)),
-                               dtype=numpy.int32)
-    for word in tokens:
-        key = str(word.i)
-        new_words.append(word_edits.get(key, word.orth_))
-        tag = tag_edits.get(key, word.pos_)
-        if tag in UNIV_POS_NAMES:
-            new_analysis[word.i, 0] = UNIV_POS_NAMES[tag]
-            # Set ent_type=0 and IOB="O"
-            new_analysis[word.i, 1] = 0
-            new_analysis[word.i, 2] = 2
-        else:
-            # Not a universal POS tag: treat the edit as an entity label.
-            new_analysis[word.i, 0] = word.pos
-            new_analysis[word.i, 1] = NLU.vocab.strings[tag]
-            new_analysis[word.i, 2] = 3
-
-    doc = NLU.tokenizer.tokens_from_list(new_words)
-    doc.from_array(attrs, new_analysis)
-    NLU.parser(doc)
-    return doc
-
-
-class Parse(Model):
-    def __init__(self, doc, states, actions, **kwargs):
-        word_edits = kwargs.get('words', {})
-        tag_edits = kwargs.get('tags', {})
-        if word_edits or tag_edits:
-            doc = apply_edits(doc, word_edits, tag_edits)
-        notes = kwargs.get('notes', {})
-        self.actions = actions
-        self.words = [Word(w, str(w.i) in word_edits, str(w.i) in tag_edits)
-                      for w in doc]
-        self.states = states
-        self.notes = notes
-        for word in doc:
-            print(word.orth_, word.head.orth_)
-
-    @classmethod
-    def from_text(cls, text, **kwargs):
-        tokens = NLU(text)
-        #merge_entities(tokens)
-        merge_nps(tokens)
-        #merge_punct(tokens)
-        return cls(tokens, [State.from_doc(tokens)], [], **kwargs)
-
-    @classmethod
-    def from_history(cls, text, history, **kwargs):
-        if not isinstance(text, unicode):
-            text = text.decode('utf8')
-        text = text.replace('-SLASH-', '/')
-        history, history_length = _parse_history(history)
-
-        tokens = NLU.tokenizer(text)
-        NLU.tagger(tokens)
-        NLU.matcher(tokens)
-
-        with NLU.parser.step_through(tokens) as state:
-            for action in history:
-                state.transition(action)
-
-        NLU.entity(tokens)
-        actions = get_actions(state.stcls, len(history))
-        return Parse(tokens, [State(state.heads, state.deps, state.stack, state.queue)],
-                     actions, **kwargs)
-
-    @classmethod
-    def with_history(cls, text):
-        tokens = NLU.tokenizer(text)
-        NLU.tagger(tokens)
-        NLU.matcher(tokens)
-
-        with NLU.parser.step_through(tokens) as state:
-            states = []
-            while not state.is_final:
-                action = state.predict()
-                state.transition(action)
-                states.append(State(state.heads, state.deps, state.stack, state.queue))
-        actions = [
-            {'label': 'prev', 'key': 'P', 'binding': 37, 'is_valid': True},
-            {'label': 'next', 'key': 'N', 'binding': 39, 'is_valid': True}
-        ]
-        return Parse(state.doc, states, actions)
-
-
-class Word(Model):
-    def __init__(self, token, is_w_edit=False, is_t_edit=False):
-        self.word = token.orth_
-        self.tag = token.pos_ if not token.ent_type_ else token.ent_type_
-        self.is_entity = token.ent_iob in (1, 3)
-        self.is_w_edit = is_w_edit
-        self.is_t_edit = is_t_edit
-        self.prob = token.prob
-
-
-class State(Model):
-    def __init__(self, heads, deps, stack, queue):
-        Model.__init__(self)
-
-        queue = [w for w in queue if w >= 0]
-        self.focus = min(queue) if queue else -1
-        self.is_final = bool(not stack and not queue)
-        self.stack = set(stack)
-        self.arrows = self._get_arrows(heads, deps)
-
-    @classmethod
-    def from_doc(cls, doc):
-        return cls([w.head.i for w in doc], [w.dep_ for w in doc], [], [])
-
-    def _get_arrows(self, heads, deps):
-        # Bucket arcs by head-child distance, so each row of the output
-        # holds the arrows drawn at one height of the arc diagram.
-        arcs = defaultdict(dict)
-        for i, (head, dep) in enumerate(zip(heads, deps)):
-            if i < head:
-                arcs[head - i][i] = Arrow(i, head, dep)
-            elif i > head:
-                arcs[i - head][head] = Arrow(i, head, dep)
-        output = []
-        for level in range(1, len(heads)):
-            level_arcs = []
-            for i in range(len(heads) - level):
-                level_arcs.append(arcs[level].get(i))
-            output.append(level_arcs)
-        # Trim empty rows from the top of the diagram.
-        while output and all(arc is None for arc in output[-1]):
-            output.pop()
-        return output
-
-
-class Arrow(Model):
-    def __init__(self, word, head, label):
-        self.dir = 'left' if head > word else 'right'
-        self.label = label
-
-
-class Endpoint(object):
-    def set_header(self, resp):
-        resp.content_type = 'application/json'
-        resp.append_header('Access-Control-Allow-Origin', "*")
-        resp.status = falcon.HTTP_200
-
-    def set_body(self, resp, parse):
-        resp.body = json.dumps(parse.to_json(), indent=4)
-
-    def on_get(self, req, resp, text):
-        if not isinstance(text, unicode):
-            text = text.decode('utf8')
-        self.set_body(resp, self.get_parse(text))
-        self.set_header(resp)
-
-    def on_post(self, req, resp):
-        try:
-            body_bytes = req.stream.read()
-            json_data = json.loads(body_bytes.decode('utf8'))
-            text = json_data['text']
-            if not isinstance(text, unicode):
-                text = text.decode('utf8')
-            self.set_body(resp, self.get_parse(text))
-            self.set_header(resp)
-        except Exception:
-            resp.status = falcon.HTTP_400
-
-
-class ParseEP(Endpoint):
-    def get_parse(self, text, **kwargs):
-        return Parse.from_text(text, **kwargs)
-
-
-class StepsEP(Endpoint):
-    def get_parse(self, text):
-        print('Step=', repr(text))
-        return Parse.with_history(text)
-
-
-class ManualEP(Endpoint):
-    def get_parse(self, text, **kwargs):
-        print('Manual=', repr(text))
-        if '/' in text:
-            text, actions = text.rsplit('/', 1)
-        else:
-            actions = ''
-        return Parse.from_history(text, actions, **kwargs)
-
-    def on_get(self, req, resp, text, actions=''):
-        if not isinstance(text, unicode):
-            text = text.decode('utf8')
-        self.set_body(resp, self.get_parse(text + '/' + actions))
-        self.set_header(resp)
-
-    def on_post(self, req, resp):
-        self.set_header(resp)
-        body_bytes = req.stream.read()
-        json_data = json.loads(body_bytes.decode('utf8'))
-        print(json_data)
-        params = json_data.get('params', {})
-        self.set_body(resp, self.get_parse(json_data['text'], **params))
-
-
-app = falcon.API()
-
-remote_man = ManualEP()
-remote_parse = ParseEP()
-remote_steps = StepsEP()
-
-app.add_route('/api/displacy/parse/', remote_parse)
-app.add_route('/api/displacy/parse/{text}/', remote_parse)
-
-app.add_route('/api/displacy/steps/', remote_steps)
-app.add_route('/api/displacy/steps/{text}/', remote_steps)
-
-app.add_route('/api/displacy/manual/', remote_man)
-app.add_route('/api/displacy/manual/{text}/', remote_man)
-app.add_route('/api/displacy/manual/{text}/{actions}', remote_man)
-
-
-if __name__ == '__main__':
-    text, actions = open(sys.argv[1]).read().strip().split('\n')
-    parse = Parse.from_text(text)
-    pprint.pprint(parse.to_json())
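
The deleted module builds a plain WSGI app (falcon.API()), so while it was in
the tree it could be served by any WSGI server and queried over HTTP. A minimal
sketch of a client, assuming the service is running locally; the gunicorn
invocation, module path, and port 8000 are illustrative assumptions, not part
of the file:

    # e.g.  gunicorn services.displacy:app --bind 127.0.0.1:8000
    import json
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen         # Python 2

    # Hit the /parse/ route registered above; the response body is the
    # JSON produced by Parse.to_json() (words, states, actions, notes).
    url = 'http://127.0.0.1:8000/api/displacy/parse/This%20is%20a%20test./'
    parse = json.loads(urlopen(url).read().decode('utf8'))
    print([w['word'] for w in parse['words']])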