2014-12-21 23:25:43 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
from os import path
|
2015-01-17 08:21:17 +03:00
|
|
|
import re
|
2015-07-19 16:18:17 +03:00
|
|
|
import struct
|
|
|
|
import json
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
from .. import orth
|
2014-12-21 23:25:43 +03:00
|
|
|
from ..vocab import Vocab
|
|
|
|
from ..tokenizer import Tokenizer
|
2015-02-22 08:32:33 +03:00
|
|
|
from ..syntax.arc_eager import ArcEager
|
2015-03-09 02:04:00 +03:00
|
|
|
from ..syntax.ner import BiluoPushDown
|
2015-07-08 13:35:29 +03:00
|
|
|
from ..syntax.parser import ParserFactory
|
2015-07-19 16:18:17 +03:00
|
|
|
from ..serialize.bits import BitArray
|
2015-07-08 13:35:29 +03:00
|
|
|
|
2015-07-08 19:53:00 +03:00
|
|
|
from ..tokens import Doc
|
2015-04-07 05:02:32 +03:00
|
|
|
from ..multi_words import RegexMerger
|
|
|
|
|
2014-12-21 23:25:43 +03:00
|
|
|
from .pos import EnPosTagger
|
2014-12-22 00:54:47 +03:00
|
|
|
from .pos import POS_TAGS
|
2014-12-21 23:25:43 +03:00
|
|
|
from .attrs import get_flags
|
2015-04-07 05:02:32 +03:00
|
|
|
from . import regexes
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2015-01-17 08:21:17 +03:00
|
|
|
from ..util import read_lang_data
|
|
|
|
|
2015-07-19 16:18:17 +03:00
|
|
|
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
|
|
|
|
2015-01-17 08:21:17 +03:00
|
|
|
|
2015-07-26 01:01:46 +03:00
|
|
|
def get_lex_props(string, oov_prob=-30):
    """Build the default lexical-attribute dict for a word string.

    Args:
        string (unicode): The word text to describe.
        oov_prob: Value stored under 'prob'. Defaults to -30.

    Returns:
        dict: Entries for 'flags', 'length', 'orth', 'lower', 'norm',
            'shape', 'prefix', 'suffix', 'cluster', 'prob' and 'sentiment'.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        # No normalisation is applied here: 'norm' is the string unchanged.
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice rather than index, so that an empty string yields an empty
        # prefix instead of raising IndexError. For any non-empty string,
        # string[:1] == string[0].
        'prefix': string[:1],
        # Last three characters (the whole string when it is shorter).
        'suffix': string[-3:],
        # 'cluster' and 'sentiment' are fixed at 0 by this function.
        'cluster': 0,
        'prob': oov_prob,
        'sentiment': 0
    }
|
|
|
|
|
2015-07-08 13:35:29 +03:00
|
|
|
# NOTE(review): this constant is not referenced anywhere in the visible code.
# Presumably a sentinel meaning "use a component only if its model is present"
# -- TODO confirm against callers before relying on it.
if_model_present = -1
|
2015-07-08 20:34:55 +03:00
|
|
|
# Default data location: the 'data' directory that sits next to this module.
# Used as the default value of English.__init__'s data_dir argument.
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
2015-01-26 18:45:21 +03:00
|
|
|
|
2015-01-14 16:33:16 +03:00
|
|
|
|
2014-12-21 23:25:43 +03:00
|
|
|
class English(object):
    """The English NLP pipeline.

    Example:

        Load data from default directory:

            >>> nlp = English()
            >>> nlp = English(data_dir=u'')

        Load data from specified directory:

            >>> nlp = English(data_dir=u'path/to/data_directory')

        Disable (and avoid loading) parts of the processing pipeline:

            >>> nlp = English(load_vectors=False, Parser=False, Tagger=False, Entity=False)

        Start with nothing loaded:

            >>> nlp = English(data_dir=None)
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self,
                 data_dir=LOCAL_DATA_DIR,
                 Tokenizer=Tokenizer.from_dir,
                 Tagger=EnPosTagger,
                 Parser=ParserFactory(ParserTransitionSystem),
                 Entity=ParserFactory(EntityTransitionSystem),
                 Packer=None,
                 load_vectors=True
                 ):
        """Load the vocabulary, tokenizer and the optional pipeline components.

        Args:
            data_dir: Directory holding the 'vocab', 'tokenizer', 'pos',
                'deps' and 'ner' sub-directories.
            Tokenizer: Callable invoked as Tokenizer(vocab, tokenizer_dir).
            Tagger: Callable invoked as Tagger(strings, data_dir). Pass True
                for the default (EnPosTagger) or a falsy value to disable.
            Parser: Callable invoked as Parser(strings, deps_dir). Pass True
                for the default or a falsy value to disable.
            Entity: Callable invoked as Entity(strings, ner_dir). Pass True
                for the default or a falsy value to disable.
            Packer: Optional callable invoked as Packer(vocab, data_dir).
            load_vectors: Forwarded to Vocab.

        A tagger, parser or entity recogniser is only created when its model
        directory exists under data_dir; otherwise the corresponding
        attribute is set to None.

        NOTE(review): data_dir=None is advertised in the class docstring, but
        path.join(data_dir, ...) is called unconditionally below -- confirm
        whether None is really supported.
        """
        self.data_dir = data_dir

        # Optional out-of-vocabulary probability, stored as text in a file.
        oov_prob_loc = path.join(data_dir, 'vocab', 'oov_prob')
        if path.exists(oov_prob_loc):
            # Use a context manager so the file handle is always closed.
            with open(oov_prob_loc) as file_:
                oov_prob = float(file_.read())
        else:
            oov_prob = None

        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                           get_lex_props=get_lex_props, load_vectors=load_vectors,
                           pos_tags=POS_TAGS,
                           oov_prob=oov_prob)

        # Passing True selects the default implementation of a component.
        if Tagger is True:
            Tagger = EnPosTagger
        # Build the factories with ParserFactory, exactly as the default
        # arguments do. Each factory captures its own transition system, so
        # enabling both Parser=True and Entity=True cannot mix them up, and
        # no un-imported module name is needed.
        if Parser is True:
            Parser = ParserFactory(self.ParserTransitionSystem)
        if Entity is True:
            Entity = ParserFactory(self.EntityTransitionSystem)

        self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))

        if Tagger and path.exists(path.join(data_dir, 'pos')):
            # The tagger receives the top-level data directory, not 'pos'.
            self.tagger = Tagger(self.vocab.strings, data_dir)
        else:
            self.tagger = None
        if Parser and path.exists(path.join(data_dir, 'deps')):
            self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
        else:
            self.parser = None
        if Entity and path.exists(path.join(data_dir, 'ner')):
            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
        else:
            self.entity = None
        if Packer:
            self.packer = Packer(self.vocab, data_dir)
        else:
            self.packer = None
        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])

    def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        Args:
            text (unicode): The text to be processed.
            tag (bool): Run the tagger, if one is loaded.
            parse (bool): Run the parser, if one is loaded.
            entity (bool): Run the entity recogniser, if one is loaded.
            merge_mwes (bool): Run the regex-based multi-word merger afterwards.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        # Each stage runs only when it is both loaded and requested.
        if self.tagger and tag:
            self.tagger(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        if merge_mwes and self.mwe_merger is not None:
            self.mwe_merger(tokens)
        return tokens

    def end_training(self, data_dir=None):
        """Finish training the models and write the vocabulary side files.

        Calls end_training() on the parser, entity and tagger models, dumps
        the string store to 'vocab/strings.txt' and writes the attribute
        frequency tables to 'vocab/serializer.json'.

        Args:
            data_dir: Output directory. Defaults to the directory this
                pipeline was loaded from.

        Raises:
            AttributeError: If the tagger, parser or entity recogniser was
                not loaded (the attribute is None).
        """
        if data_dir is None:
            data_dir = self.data_dir
        self.parser.model.end_training()
        self.entity.model.end_training()
        self.tagger.model.end_training()
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))

        # Materialise each items() result as a list: on Python 3 a dict view
        # is not JSON serialisable. Build the payload before opening the file
        # so that a failure here cannot leave a truncated serializer.json.
        freqs = [
            (TAG, list(self.tagger.freqs[TAG].items())),
            (DEP, list(self.parser.moves.freqs[DEP].items())),
            (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
            (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
            (HEAD, list(self.parser.moves.freqs[HEAD].items()))]
        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
            file_.write(json.dumps(freqs))

    @property
    def tags(self):
        """Deprecated. List of part-of-speech tag names."""
        return self.tagger.tag_names
|