#!/usr/bin/env python
from __future__ import unicode_literals
from __future__ import print_function

import sys
import json
import pprint
from collections import defaultdict
from os import path

import falcon
import numpy

import spacy.en
from spacy.attrs import ORTH, SPACY, TAG, POS, ENT_IOB, ENT_TYPE
from spacy.parts_of_speech import NAMES as UNIV_POS_NAMES

try:
    unicode
except NameError:
    unicode = str


NLU = spacy.en.English()


def merge_entities(doc):
    # Collapse each multi-token entity into a single token.
    ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.text)
            for e in doc.ents if len(e) >= 2]
    for start, end, label, text in ents:
        merged = doc.merge(start, end, label, text, label)
        assert merged is not None


def merge_nps(doc):
    # Collapse each multi-token noun chunk into a single token tagged 'NP'.
    nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.root.tag_, np.text)
           for np in doc.noun_chunks if len(np) >= 2]

    for start, end, root_tag, text in nps:
        doc.merge(start, end, u'NP', text, root_tag)


def merge_punct(tokens):
    # Merge punctuation onto its head: runs of tokens written without
    # separating whitespace are collapsed into a single token.
    collect = False
    start = None
    merges = []

    for word in tokens:
        if word.whitespace_:
            if collect:
                span = tokens[start:word.i + 1]
                if len(span) >= 2:
                    merges.append((
                        span[0].idx,
                        span[-1].idx + len(span[-1]),
                        span.root.tag_,
                        span.root.lemma_,
                        span.root.ent_type_))
                collect = False
                start = None
        elif not collect:
            collect = True
            start = word.i
    if collect:
        span = tokens[start:]
        merges.append((span[0].idx, span[-1].idx + len(span[-1]),
                       span.root.tag_, span.root.lemma_, span.root.ent_type_))
    for merge in merges:
        tokens.merge(*merge)


def get_actions(parse_state, n_actions):
    # Key bindings are JavaScript keyCodes: 37-40 are the arrow keys,
    # 32 is space and 8 is backspace.
    actions = []
    actions.append({'label': 'shift', 'key': 'S', 'binding': 38,
                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'S')})
    actions.append({'label': 'left', 'key': 'L', 'binding': 37,
                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'L-det')})
    actions.append({'label': 'predict', 'key': '_', 'binding': 32,
                    'is_valid': bool(parse_state.queue or parse_state.stack)})
    actions.append({'label': 'right', 'key': 'R', 'binding': 39,
                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'R-dobj')})
    actions.append({'label': 'undo', 'key': '-', 'binding': 8,
                    'is_valid': n_actions != 0})
    actions.append({'label': 'reduce', 'key': 'D', 'binding': 40,
                    'is_valid': NLU.parser.moves.is_valid(parse_state, 'D')})
    return actions


class Model(object):
    def to_json(self):
        return {name: _as_json(value) for name, value in self.__dict__.items()
                if not name.startswith('_')}


def _as_json(value):
    if hasattr(value, 'to_json'):
        return value.to_json()
    elif isinstance(value, list):
        return [_as_json(v) for v in value]
    elif isinstance(value, set):
        return {key: True for key in value}
    else:
        return value


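# Note: _as_json recurses through nested Model instances, lists and sets, so
# Parse.to_json() below yields plain dicts and lists that json.dumps can
# serialize without a custom encoder.

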
def _parse_history(history):
    if history and history.endswith(','):
        history = history[:-1]
    history = history.strip().split(',') if history else tuple()
    new_hist = []
    history_length = len(history)
    for action in history:
        if action == '-':
            if new_hist:
                new_hist.pop()
        else:
            new_hist.append(action)
    return new_hist, history_length


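# Illustration (the action names here are examples only -- valid moves come
# from the parser's transition system): _parse_history('S,S,-,L-det') returns
# (['S', 'L-det'], 4). A '-' entry undoes the previous action, while the
# second value still counts every submitted step.

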
def apply_edits(tokens, word_edits, tag_edits):
    new_words = []
    attrs = (POS, ENT_TYPE, ENT_IOB)
    new_analysis = numpy.zeros(shape=(len(tokens), len(attrs)), dtype=numpy.int32)
    for word in tokens:
        key = str(word.i)
        new_words.append(word_edits.get(key, word.orth_))
        tag = tag_edits.get(key, word.pos_)
        if tag in UNIV_POS_NAMES:
            new_analysis[word.i, 0] = UNIV_POS_NAMES[tag]
            # Set ent_type=0 and IOB="O"
            new_analysis[word.i, 1] = 0
            new_analysis[word.i, 2] = 2
        else:
            # Treat an unrecognised tag as an entity type, with IOB="B"
            new_analysis[word.i, 0] = word.pos
            new_analysis[word.i, 1] = NLU.vocab.strings[tag]
            new_analysis[word.i, 2] = 3

    doc = NLU.tokenizer.tokens_from_list(new_words)
    doc.from_array(attrs, new_analysis)
    NLU.parser(doc)
    return doc


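# The word/tag edit dicts are keyed by token index as a string (matching
# str(word.i) above), presumably because they arrive via the JSON request
# body; values are the replacement word form or tag.

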
class Parse(Model):
    def __init__(self, doc, states, actions, **kwargs):
        word_edits = kwargs.get('words', {})
        tag_edits = kwargs.get('tags', {})
        if word_edits or tag_edits:
            doc = apply_edits(doc, word_edits, tag_edits)
        notes = kwargs.get('notes', {})
        self.actions = actions
        # Edit keys are string token indices, so compare against str(w.i).
        self.words = [Word(w, str(w.i) in word_edits, str(w.i) in tag_edits)
                      for w in doc]
        self.states = states
        self.notes = notes
        for word in doc:
            print(word.orth_, word.head.orth_)

    @classmethod
    def from_text(cls, text, **kwargs):
        tokens = NLU(text)
        #merge_entities(tokens)
        merge_nps(tokens)
        #merge_punct(tokens)
        return cls(tokens, [State.from_doc(tokens)], [], **kwargs)

    @classmethod
    def from_history(cls, text, history, **kwargs):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        text = text.replace('-SLASH-', '/')
        history, history_length = _parse_history(history)

        tokens = NLU.tokenizer(text)
        NLU.tagger(tokens)
        NLU.matcher(tokens)

        with NLU.parser.step_through(tokens) as state:
            for action in history:
                state.transition(action)

        NLU.entity(tokens)
        actions = get_actions(state.stcls, len(history))
        return Parse(tokens, [State(state.heads, state.deps, state.stack, state.queue)],
                     actions, **kwargs)

    @classmethod
    def with_history(cls, text):
        tokens = NLU.tokenizer(text)
        NLU.tagger(tokens)
        NLU.matcher(tokens)

        with NLU.parser.step_through(tokens) as state:
            states = []
            while not state.is_final:
                action = state.predict()
                state.transition(action)
                states.append(State(state.heads, state.deps, state.stack, state.queue))
        actions = [
            {'label': 'prev', 'key': 'P', 'binding': 37, 'is_valid': True},
            {'label': 'next', 'key': 'N', 'binding': 39, 'is_valid': True}
        ]
        return Parse(state.doc, states, actions)


class Word(Model):
    def __init__(self, token, is_w_edit=False, is_t_edit=False):
        self.word = token.orth_
        # Show the entity type as the tag when the token is part of an
        # entity; otherwise fall back to the coarse POS tag.
        self.tag = token.pos_ if not token.ent_type_ else token.ent_type_
        self.is_entity = token.ent_iob in (1, 3)
        self.is_w_edit = is_w_edit
        self.is_t_edit = is_t_edit
        self.prob = token.prob


class State(Model):
    def __init__(self, heads, deps, stack, queue):
        Model.__init__(self)

        queue = [w for w in queue if w >= 0]
        self.focus = min(queue) if queue else -1
        self.is_final = bool(not stack and not queue)
        self.stack = set(stack)
        self.arrows = self._get_arrows(heads, deps)

    @classmethod
    def from_doc(cls, doc):
        return cls([w.head.i for w in doc], [w.dep_ for w in doc], [], [])

    def _get_arrows(self, heads, deps):
        # Group arcs by their length ("level"), keyed by the position of the
        # leftmost token they span; row k of the output holds the arcs of
        # length k + 1, with None where no arc starts at that position.
        arcs = defaultdict(dict)
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                arcs[head - i][i] = Arrow(i, head, dep)
            elif i > head:
                arcs[i - head][head] = Arrow(i, head, dep)
        output = []
        for level in range(1, len(heads)):
            level_arcs = []
            for i in range(len(heads) - level):
                level_arcs.append(arcs[level].get(i))
            output.append(level_arcs)
        # Trim trailing all-empty levels.
        while output and all(arc is None for arc in output[-1]):
            output.pop()
        return output


class Arrow(Model):
    def __init__(self, word, head, label):
        self.dir = 'left' if head > word else 'right'
        self.label = label


class Endpoint(object):
    def set_header(self, resp):
        # The body set below is JSON.
        resp.content_type = 'application/json'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200

    def set_body(self, resp, parse):
        resp.body = json.dumps(parse.to_json(), indent=4)

    def on_get(self, req, resp, text):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        self.set_body(resp, self.get_parse(text))
        self.set_header(resp)

    def on_post(self, req, resp):
        try:
            body_bytes = req.stream.read()
            json_data = json.loads(body_bytes.decode('utf8'))
            text = json_data['text']
            if not isinstance(text, unicode):
                text = text.decode('utf8')
            self.set_body(resp, self.get_parse(text))
            self.set_header(resp)
        except Exception:
            # Malformed requests are silently dropped; the client gets an
            # empty response rather than a traceback.
            pass


class ParseEP(Endpoint):
    def get_parse(self, text, **kwargs):
        return Parse.from_text(text, **kwargs)


class StepsEP(Endpoint):
    def get_parse(self, text):
        print('Step=', repr(text))
        return Parse.with_history(text)


class ManualEP(Endpoint):
    def get_parse(self, text, **kwargs):
        print('Manual=', repr(text))
        if '/' in text:
            text, actions = text.rsplit('/', 1)
        else:
            actions = ''
        return Parse.from_history(text, actions, **kwargs)

    def on_get(self, req, resp, text, actions=''):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        self.set_body(resp, self.get_parse(text + '/' + actions))
        self.set_header(resp)

    def on_post(self, req, resp):
        self.set_header(resp)
        body_bytes = req.stream.read()
        json_data = json.loads(body_bytes.decode('utf8'))
        print(json_data)
        params = json_data.get('params', {})
        self.set_body(resp, self.get_parse(json_data['text'], **params))


app = falcon.API()

remote_man = ManualEP()
remote_parse = ParseEP()
remote_steps = StepsEP()

app.add_route('/api/displacy/parse/', remote_parse)
app.add_route('/api/displacy/parse/{text}/', remote_parse)

app.add_route('/api/displacy/steps/', remote_steps)
app.add_route('/api/displacy/steps/{text}/', remote_steps)

app.add_route('/api/displacy/manual/', remote_man)
app.add_route('/api/displacy/manual/{text}/', remote_man)
app.add_route('/api/displacy/manual/{text}/{actions}', remote_man)


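# Example deployment (assumptions: this module is saved as displacy_app.py and
# gunicorn is available -- any WSGI server works the same way):
#
#     gunicorn displacy_app:app --bind 0.0.0.0:8000
#
# after which e.g. GET /api/displacy/parse/This%20is%20a%20test/ returns the
# Parse.to_json() payload as JSON.

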
if __name__ == '__main__':
    text, actions = open(sys.argv[1]).read().strip().split('\n')
    parse = Parse.from_text(text)
    pprint.pprint(parse.to_json())