mirror of https://github.com/explosion/spaCy.git
synced 2025-02-13 18:10:35 +03:00

Revert noise-level back to default 0.0

commit 167f6a8938

.gitignore (vendored) | 1

@@ -40,7 +40,6 @@ venv/

 # Distribution / packaging
 env/
-bin/
 build/
 develop-eggs/
 dist/

@@ -14,8 +14,7 @@ os:
 env:
 - VIA=compile LC_ALL=en_US.ascii
 - VIA=compile
+#- VIA=pypi_nightly
-# - VIA=sdist

 install:
 - "./travis.sh"

@@ -23,7 +22,7 @@ install:
 script:
 - "pip install pytest pytest-timeout"
 - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
+- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
 - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi

 notifications:

@@ -1,3 +1,4 @@
 recursive-include include *.h
 include LICENSE
 include README.rst
+include bin/spacy

@@ -229,7 +229,7 @@ Compile from source
 The other way to install spaCy is to clone its
 `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
 source. That is the common way if you want to make changes to the code base.
-You'll need to make sure that you have a development enviroment consisting of a
+You'll need to make sure that you have a development environment consisting of a
 Python distribution including header files, a compiler,
 `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
 and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

examples/training/train_textcat.py | 109 (new file)

from __future__ import unicode_literals
import plac
import random
import tqdm

from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets

import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer


def train_textcat(tokenizer, textcat,
                  train_texts, train_cats, dev_texts, dev_cats,
                  n_iter=20):
    '''
    Train the TextCategorizer without associated pipeline.
    '''
    textcat.begin_training()
    optimizer = Adam(NumpyOps(), 0.001)
    train_docs = [tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = zip(train_docs, train_gold)
    batch_sizes = compounding(4., 128., 1.001)
    for i in range(n_iter):
        losses = {}
        train_data = tqdm.tqdm(train_data, leave=False)  # Progress bar
        for batch in minibatch(train_data, size=batch_sizes):
            docs, golds = zip(*batch)
            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
                           losses=losses)
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
        yield losses['textcat'], scores


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if score >= 0.5 and label in gold:
                tp += 1.
            elif score >= 0.5 and label not in gold:
                fp += 1.
            elif score < 0.5 and label not in gold:
                tn += 1
            if score < 0.5 and label in gold:
                fn += 1
    precis = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = 2 * (precis * recall) / (precis + recall)
    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}


def load_data():
    # Partition off part of the train data --- avoid running experiments
    # against test.
    train_data, _ = thinc.extra.datasets.imdb()

    random.shuffle(train_data)

    texts, labels = zip(*train_data)
    cats = [(['POSITIVE'] if y else []) for y in labels]

    split = int(len(train_data) * 0.8)

    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]
    return (train_texts, train_cats), (dev_texts, dev_cats)


def main(model_loc=None):
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])

    print("Load IMDB data")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()

    print("Itn.\tLoss\tP\tR\tF")
    progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'

    for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
                                                     train_texts, train_cats,
                                                     dev_texts, dev_cats,
                                                     n_iter=20)):
        print(progress.format(i=i, loss=loss, **scores))
    # How to save, load and use
    nlp.pipeline.append(textcat)
    if model_loc is not None:
        nlp.to_disk(model_loc)

        nlp = spacy.load(model_loc)
        doc = nlp(u'This movie sucked!')
        print(doc.cats)


if __name__ == '__main__':
    plac.call(main)
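
Note on the example above: minibatch() is driven by compounding(4., 128., 1.001), so batch sizes start at 4 and grow geometrically toward a cap of 128 as training proceeds. A rough standalone sketch of that kind of schedule, for illustration only and not spaCy's exact implementation:

import itertools

def compounding_schedule(start, stop, compound):
    # Yield start, start*compound, start*compound**2, ..., capped at stop.
    value = start
    while True:
        yield min(value, stop)
        value *= compound

# First few batch sizes produced by a 4 -> 128 schedule with factor 1.001.
print(list(itertools.islice(compounding_schedule(4., 128., 1.001), 5)))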

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.7.3,<6.8.0
+thinc>=6.8.0,<6.9.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

setup.py | 5

@@ -28,7 +28,9 @@ MOD_NAMES = [
     'spacy.pipeline',
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
+    'spacy.syntax._beam_utils',
     'spacy.tokenizer',
+    'spacy._cfile',
     'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
     'spacy.syntax.beam_parser',

@@ -187,12 +189,13 @@ def setup_package():
         url=about['__uri__'],
         license=about['__license__'],
         ext_modules=ext_modules,
+        scripts=['bin/spacy'],
         install_requires=[
             'numpy>=1.7',
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.7.3,<6.8.0',
+            'thinc>=6.8.0,<6.9.0',
             'plac<1.0.0,>=0.9.6',
             'pip>=9.0.0,<10.0.0',
             'six',

@@ -13,5 +13,10 @@ def load(name, **overrides):
     return util.load_model(name, **overrides)


+def blank(name, **kwargs):
+    LangClass = util.get_lang_class(name)
+    return LangClass(**kwargs)
+
+
 def info(model=None, markdown=False):
     return cli_info(None, model, markdown)
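
The blank() helper added above builds a bare Language object from the language class data only, without loading a trained model. Assuming an English model package or shortcut link is installed for the load() call, the difference looks roughly like this:

import spacy

# Just the language defaults (tokenizer, vocab); no trained weights.
nlp_blank = spacy.blank('en')

# A trained pipeline still goes through spacy.load(); this requires that
# an 'en' model package or shortcut link is installed.
nlp_trained = spacy.load('en')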

@@ -3,15 +3,23 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals


 if __name__ == '__main__':
     import plac
     import sys
-    from spacy.cli import download, link, info, package, train, convert
+    from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints

-    commands = {'download': download, 'link': link, 'info': info, 'train': train,
-                'convert': convert, 'package': package}
+    commands = {
+        'download': download,
+        'link': link,
+        'info': info,
+        'train': train,
+        'convert': convert,
+        'package': package,
+        'model': model,
+        'profile': profile,
+    }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)

@@ -19,5 +27,7 @@ if __name__ == '__main__':
     if command in commands:
         plac.call(commands[command])
     else:
-        prints("Available: %s" % ', '.join(commands),
-               title="Unknown command: %s" % command, exits=1)
+        prints(
+            "Available: %s" % ', '.join(commands),
+            title="Unknown command: %s" % command,
+            exits=1)

spacy/_cfile.pxd | 26 (new file)

from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool

cdef class CFile:
    cdef FILE* fp
    cdef bint is_open
    cdef Pool mem
    cdef int size  # For compatibility with subclass
    cdef int _capacity  # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile(CFile):
    cdef unsigned char* data

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/_cfile.pyx | 88 (new file)

from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def close(self):
        self.is_open = False

    def string_data(self):
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)

spacy/_ml.py | 358

@@ -3,23 +3,101 @@ from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.util import get_array_module
+import random
+import cytoolz
+
 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.batchnorm import BatchNorm as BN
+from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
 from thinc.neural import ReLu
+from thinc.neural._classes.selu import SELU
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
+from thinc.neural._classes.attention import ParametricAttention
+from thinc.linear.linear import LinearModel
+from thinc.api import uniqued, wrap, flatten_add_lengths
+
-from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
+from . import util
+
 import numpy
 import io


+@layerize
+def _flatten_add_lengths(seqs, pad=0, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=pad)
+    X = ops.flatten(seqs, pad=pad)
+    return (X, lengths), finish_update
+
+
+@layerize
+def _logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+@layerize
+def add_tuples(X, drop=0.):
+    """Give inputs of sequence pairs, where each sequence is (vals, length),
+    sum the values, returning a single sequence.
+
+    If input is:
+    ((vals1, length), (vals2, length)
+    Output is:
+    (vals1+vals2, length)
+
+    vals are a single tensor for the whole batch.
+    """
+    (vals1, length1), (vals2, length2) = X
+    assert length1 == length2
+
+    def add_tuples_bwd(dY, sgd=None):
+        return (dY, dY)
+
+    return (vals1+vals2, length), add_tuples_bwd
+
+
+def _zero_init(model):
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    if model.W is not None:
+        model.W.fill(0.)
+    return model
+
+
+@layerize
+def _preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+
+
 def _init_for_precomputed(W, ops):
     if (W**2).sum() != 0.:
         return

@@ -27,6 +105,7 @@ def _init_for_precomputed(W, ops):
     ops.xavier_uniform_init(reshaped)
     W[:] = reshaped.reshape(W.shape)

+
 @describe.on_data(_set_dimensions_if_needed)
 @describe.attributes(
     nI=Dimension("Input size"),

@@ -130,25 +209,42 @@ class PrecomputableMaxouts(Model):
             return dXf
         return Yfp, backward


+def drop_layer(layer, factor=2.):
+    def drop_layer_fwd(X, drop=0.):
+        if drop <= 0.:
+            return layer.begin_update(X, drop=drop)
+        else:
+            coinflip = layer.ops.xp.random.random()
+            if (coinflip / factor) >= drop:
+                return layer.begin_update(X, drop=drop)
+            else:
+                return X, lambda dX, sgd=None: dX
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model
+
+
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
         norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
         shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')

-        embed = (norm | prefix | suffix | shape )
+        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
-                >> embed
-                >> Maxout(width, width*4, pieces=3)
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
-                pad=4)
+                >> uniqued(embed, column=5)
+                >> drop_layer(
+                    Residual(
+                        (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
+                    )
+                ) ** 4, pad=4
+            )
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec

@@ -243,7 +339,8 @@ def zero_init(model):


 def doc2feats(cols=None):
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
+    if cols is None:
+        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:

@@ -269,6 +366,45 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
     return vectors, backward


+def fine_tune(embedding, combine=None):
+    if combine is not None:
+        raise NotImplementedError(
+            "fine_tune currently only supports addition. Set combine=None")
+    def fine_tune_fwd(docs_tokvecs, drop=0.):
+        docs, tokvecs = docs_tokvecs
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
+
+        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
+        flat_tokvecs = embedding.ops.flatten(tokvecs)
+        flat_vecs = embedding.ops.flatten(vecs)
+        output = embedding.ops.unflatten(
+            (model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
+
+        def fine_tune_bwd(d_output, sgd=None):
+            flat_grad = model.ops.flatten(d_output)
+            model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
+            model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
+
+            bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
+            if sgd is not None:
+                sgd(model._mem.weights, model._mem.gradient, key=model.id)
+            return [d_o * model.mix[0] for d_o in d_output]
+        return output, fine_tune_bwd
+
+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
+    model = wrap(fine_tune_fwd, embedding)
+    model.mix = model._mem.add((model.id, 'mix'), (2,))
+    model.mix.fill(0.5)
+    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
+    return model
+
+
 @layerize
 def flatten(seqs, drop=0.):
     if isinstance(seqs[0], numpy.ndarray):

@@ -282,3 +418,201 @@ def flatten(seqs, drop=0.):
         return ops.unflatten(d_X, lengths)
     X = ops.xp.vstack(seqs)
     return X, finish_update
+
+
+@layerize
+def logistic(X, drop=0.):
+    xp = get_array_module(X)
+    if not isinstance(X, xp.ndarray):
+        X = xp.asarray(X)
+    # Clip to range (-10, 10)
+    X = xp.minimum(X, 10., X)
+    X = xp.maximum(X, -10., X)
+    Y = 1. / (1. + xp.exp(-X))
+    def logistic_bwd(dY, sgd=None):
+        dX = dY * (Y * (1-Y))
+        return dX
+    return Y, logistic_bwd
+
+
+def zero_init(model):
+    def _zero_init_impl(self, X, y):
+        self.W.fill(0)
+    model.on_data_hooks.append(_zero_init_impl)
+    return model
+
+@layerize
+def preprocess_doc(docs, drop=0.):
+    keys = [doc.to_array([LOWER]) for doc in docs]
+    keys = [a[:, 0] for a in keys]
+    ops = Model.ops
+    lengths = ops.asarray([arr.shape[0] for arr in keys])
+    keys = ops.xp.concatenate(keys)
+    vals = ops.allocate(keys.shape[0]) + 1
+    return (keys, vals, lengths), None
+
+def getitem(i):
+    def getitem_fwd(X, drop=0.):
+        return X[i], None
+    return layerize(getitem_fwd)
+
+def build_tagger_model(nr_class, token_vector_width, **cfg):
+    embed_size = util.env_opt('embed_size', 7500)
+    with Model.define_operators({'>>': chain, '+': add}):
+        # Input: (doc, tensor) tuples
+        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
+
+        model = (
+            fine_tune(private_tok2vec)
+            >> with_flatten(
+                Maxout(token_vector_width, token_vector_width)
+                >> Softmax(nr_class, token_vector_width)
+            )
+        )
+    model.nI = None
+    return model
+
+
+@layerize
+def SpacyVectors(docs, drop=0.):
+    xp = get_array_module(docs[0].vocab.vectors.data)
+    width = docs[0].vocab.vectors.data.shape[1]
+    batch = []
+    for doc in docs:
+        indices = numpy.zeros((len(doc),), dtype='i')
+        for i, word in enumerate(doc):
+            if word.orth in doc.vocab.vectors.key2row:
+                indices[i] = doc.vocab.vectors.key2row[word.orth]
+            else:
+                indices[i] = 0
+        vectors = doc.vocab.vectors.data[indices]
+        batch.append(vectors)
+    return batch, None
+
+
+def foreach(layer, drop_factor=1.0):
+    '''Map a layer across elements in a list'''
+    def foreach_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        ys = []
+        backprops = []
+        for X in Xs:
+            y, bp_y = layer.begin_update(X, drop=drop)
+            ys.append(y)
+            backprops.append(bp_y)
+        def foreach_bwd(d_ys, sgd=None):
+            d_Xs = []
+            for d_y, bp_y in zip(d_ys, backprops):
+                if bp_y is not None and bp_y is not None:
+                    d_Xs.append(d_y, sgd=sgd)
+                else:
+                    d_Xs.append(None)
+            return d_Xs
+        return ys, foreach_bwd
+    model = wrap(foreach_fwd, layer)
+    return model
+
+
+def build_text_classifier(nr_class, width=64, **cfg):
+    nr_vector = cfg.get('nr_vector', 5000)
+    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
+                                 '**': clone}):
+        if cfg.get('low_data'):
+            model = (
+                SpacyVectors
+                >> flatten_add_lengths
+                >> with_getitem(0,
+                    Affine(width, 300)
+                )
+                >> ParametricAttention(width)
+                >> Pooling(sum_pool)
+                >> Residual(ReLu(width, width)) ** 2
+                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+                >> logistic
+            )
+            return model
+
+
+        lower = HashEmbed(width, nr_vector, column=1)
+        prefix = HashEmbed(width//2, nr_vector, column=2)
+        suffix = HashEmbed(width//2, nr_vector, column=3)
+        shape = HashEmbed(width//2, nr_vector, column=4)
+
+        trained_vectors = (
+            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
+            >> with_flatten(
+                uniqued(
+                    (lower | prefix | suffix | shape)
+                    >> LN(Maxout(width, width+(width//2)*3)),
+                    column=0
+                )
+            )
+        )
+
+        static_vectors = (
+            SpacyVectors
+            >> with_flatten(Affine(width, 300))
+        )
+
+        cnn_model = (
+            # TODO Make concatenate support lists
+            concatenate_lists(trained_vectors, static_vectors)
+            >> with_flatten(
+                LN(Maxout(width, width*2))
+                >> Residual(
+                    (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
+                ) ** 2, pad=2
+            )
+            >> flatten_add_lengths
+            >> ParametricAttention(width)
+            >> Pooling(sum_pool)
+            >> Residual(zero_init(Maxout(width, width)))
+            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
+        )
+
+        linear_model = (
+            _preprocess_doc
+            >> LinearModel(nr_class, drop_factor=0.)
+        )
+
+        model = (
+            (linear_model | cnn_model)
+            >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
+            >> logistic
+        )
+
+    model.lsuv = False
+    return model
+
+@layerize
+def flatten(seqs, drop=0.):
+    ops = Model.ops
+    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
+    def finish_update(d_X, sgd=None):
+        return ops.unflatten(d_X, lengths, pad=0)
+    X = ops.flatten(seqs, pad=0)
+    return X, finish_update
+
+
+def concatenate_lists(*layers, **kwargs):  # pragma: no cover
+    '''Compose two or more models `f`, `g`, etc, such that their outputs are
+    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+    '''
+    if not layers:
+        return noop()
+    drop_factor = kwargs.get('drop_factor', 1.0)
+    ops = layers[0].ops
+    layers = [chain(layer, flatten) for layer in layers]
+    concat = concatenate(*layers)
+    def concatenate_lists_fwd(Xs, drop=0.):
+        drop *= drop_factor
+        lengths = ops.asarray([len(X) for X in Xs], dtype='i')
+        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+        ys = ops.unflatten(flat_y, lengths)
+        def concatenate_lists_bwd(d_ys, sgd=None):
+            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+        return ys, concatenate_lists_bwd
+    model = wrap(concatenate_lists_fwd, concat)
+    return model
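
The logistic layer added above clips its input to the range (-10, 10) before applying the sigmoid, and backpropagates through Y * (1 - Y). The same forward/backward arithmetic in plain NumPy, outside of thinc's layer machinery (a sketch, not the library code):

import numpy as np

def logistic_forward(X):
    X = np.clip(X, -10., 10.)           # avoid overflow in exp()
    Y = 1. / (1. + np.exp(-X))
    def logistic_backward(dY):
        return dY * (Y * (1. - Y))      # sigmoid gradient
    return Y, logistic_backward

Y, backward = logistic_forward(np.array([-20., 0., 3.]))
print(Y)                     # approx. [4.5e-05, 0.5, 0.953]
print(backward(np.ones(3)))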

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a1'
+__version__ = '2.0.0a13'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'

@@ -2,5 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
+from .model import model

@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.

@@ -73,10 +73,10 @@ def generate_sentence(sent):
     tokens = []
     for i, id in enumerate(id_):
         token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
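
The fix above indexes the word, tag and dep columns by the loop position i and stores each head as an offset relative to the token's own CoNLL id. A small standalone sketch of that offset arithmetic on toy data (not the converter itself):

# Toy CoNLL-style columns: token ids, surface forms, absolute head ids.
id_ = [1, 2, 3]
word = ['She', 'likes', 'cats']
head = [2, 2, 2]              # every token's head is 'likes' (id 2)

tokens = []
for i, id in enumerate(id_):
    tokens.append({
        'orth': word[i],
        'head': head[i] - id,  # relative offset; 0 means the token heads itself
    })
print([t['head'] for t in tokens])   # [1, 0, -1]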

@@ -8,7 +8,7 @@ import subprocess
 import sys

 from .link import link
-from ..util import prints
+from ..util import prints, get_package_path
 from .. import about


@@ -24,15 +24,20 @@ def download(cmd, model, direct=False):
     with version.
     """
     if direct:
-        download_model('{m}/{m}.tar.gz'.format(m=model))
+        dl = download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
         shortcuts = get_json(about.__shortcuts__, "available shortcuts")
         model_name = shortcuts.get(model, model)
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
-        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
+        if dl == 0:
             try:
-                link(None, model_name, model, force=True)
+                # Get package path here because link uses
+                # pip.get_installed_distributions() to check if model is a package,
+                # which fails if model was just installed via subprocess
+                package_path = get_package_path(model_name)
+                link(None, model_name, model, force=True, model_path=package_path)
             except:
                 # Dirty, but since spacy.download and the auto-linking is mostly
                 # a convenience wrapper, it's best to show a success message and

@@ -73,6 +78,6 @@ def get_version(model, comp):

 def download_model(filename):
     download_url = about.__download_url__ + '/' + filename
-    subprocess.call([sys.executable, '-m',
+    return subprocess.call([sys.executable, '-m',
         'pip', 'install', '--no-cache-dir', download_url],
         env=os.environ.copy())

@@ -14,7 +14,7 @@ from .. import util
     link_name=("name of shortuct link to create", "positional", None, str),
     force=("force overwriting of existing link", "flag", "f", bool)
 )
-def link(cmd, origin, link_name, force=False):
+def link(cmd, origin, link_name, force=False, model_path=None):
     """
     Create a symlink for models within the spacy/data directory. Accepts
     either the name of a pip package, or the local path to the model data

@@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
     if util.is_package(origin):
         model_path = util.get_package_path(origin)
     else:
-        model_path = Path(origin)
+        model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
                title="Can't locate model data", exits=1)

spacy/cli/model.py | 137 (new file)

# coding: utf8
from __future__ import unicode_literals

import bz2
import gzip
import math
from ast import literal_eval
from pathlib import Path

import numpy as np
import spacy
from preshed.counter import PreshCounter

from .. import util
from ..compat import fix_text


def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
          min_doc_freq=5, min_word_freq=200):
    model_path = Path(model_dir)
    freqs_path = Path(freqs_data)
    clusters_path = Path(clusters_data) if clusters_data else None
    vectors_path = Path(vectors_data) if vectors_data else None

    check_dirs(freqs_path, clusters_path, vectors_path)
    vocab = util.get_lang_class(lang).Defaults.create_vocab()
    nlp = spacy.blank(lang)
    vocab = nlp.vocab
    probs, oov_prob = read_probs(
        freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_doc_freq))
    clusters = read_clusters(clusters_path) if clusters_path else {}
    populate_vocab(vocab, clusters, probs, oov_prob)
    add_vectors(vocab, vectors_path)
    create_model(model_path, nlp)


def add_vectors(vocab, vectors_path):
    with bz2.BZ2File(vectors_path.as_posix()) as f:
        num_words, dim = next(f).split()
        vocab.clear_vectors(int(dim))
        for line in f:
            word_w_vector = line.decode("utf8").strip().split(" ")
            word = word_w_vector[0]
            vector = np.array([float(val) for val in word_w_vector[1:]])
            if word in vocab:
                vocab.set_vector(word, vector)


def create_model(model_path, model):
    if not model_path.exists():
        model_path.mkdir()
    model.to_disk(model_path.as_posix())


def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(
                key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


def read_clusters(clusters_path):
    clusters = {}
    with clusters_path.open() as f:
        for line in f:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = '0'
    # Expand clusters with re-casing
    for word, cluster in list(clusters.items()):
        if word.lower() not in clusters:
            clusters[word.lower()] = cluster
        if word.title() not in clusters:
            clusters[word.title()] = cluster
        if word.upper() not in clusters:
            clusters[word.upper()] = cluster
    return clusters


def populate_vocab(vocab, clusters, probs, oov_prob):
    for word, prob in reversed(
            sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0


def check_unzip(file_path):
    file_path_str = file_path.as_posix()
    if file_path_str.endswith('gz'):
        return gzip.open(file_path_str)
    else:
        return file_path.open()


def check_dirs(freqs_data, clusters_data, vectors_data):
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    if clusters_data and not clusters_data.is_file():
        util.sys_exit(
            clusters_data.as_posix(), title="No Brown clusters file found")
    if vectors_data and not vectors_data.is_file():
        util.sys_exit(
            vectors_data.as_posix(), title="No word vectors file found")
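
read_probs() above turns raw corpus counts into smoothed log probabilities: every kept word gets log(smoothed_count) - log(total), and the out-of-vocabulary probability comes from the smoother's estimate for a count of zero. A simplified sketch with plain dicts, where add-one smoothing stands in for PreshCounter's real smoother:

import math

# Toy frequency table: word -> raw corpus count.
freqs = {'the': 60000, 'cat': 300, 'zyzzyva': 1}
total = sum(freqs.values())
log_total = math.log(total)

def smooth(count):
    # Placeholder; PreshCounter.smoother() uses a frequency-of-frequencies
    # estimate rather than simple add-one smoothing.
    return count + 1.0

probs = {word: math.log(smooth(count)) - log_total
         for word, count in freqs.items()}
oov_prob = math.log(smooth(0)) - log_total   # log probability for unseen words
print(probs['the'], oov_prob)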

@@ -15,10 +15,11 @@ from .. import about
 @plac.annotations(
     input_dir=("directory with model data", "positional", None, str),
     output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
+    meta_path=("path to meta.json", "option", "m", str),
+    create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
     force=("force overwriting of existing folder in output directory", "flag", "f", bool)
 )
-def package(cmd, input_dir, output_dir, meta=None, force=False):
+def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
     """
     Generate Python package for model data, including meta and required
     installation files. A new directory will be created in the specified

@@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta)
+    meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
         prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():

@@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
+    if not create_meta and meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
         meta = util.read_json(meta_path)
     else:

@@ -100,7 +101,7 @@ def generate_meta():
 def generate_pipeline():
     prints("If set to 'True', the default pipeline is used. If set to 'False', "
            "the pipeline will be disabled. Components should be specified as a "
-           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "comma-separated list of component names, e.g. tensorizer, tagger, "
           "parser, ner. For more information, see the docs on processing pipelines.",
           title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)

spacy/cli/profile.py | 45 (new file)

# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import cProfile
import pstats

import spacy
import sys
import tqdm
import cytoolz


def read_inputs(loc):
    if loc is None:
        file_ = sys.stdin
        file_ = (line.encode('utf8') for line in file_)
    else:
        file_ = Path(loc).open()
    for line in file_:
        data = ujson.loads(line)
        text = data['text']
        yield text


@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


def parse_texts(nlp, texts):
    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
        pass

@@ -32,10 +32,12 @@ from ..compat import json_dumps
     resume=("Whether to resume training", "flag", "R", bool),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
+    no_entities=("Don't train NER", "flag", "N", bool),
+    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
+          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
+          gold_preproc=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """

@@ -69,7 +71,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
     gold_preproc = util.env_opt('gold_preproc', False)
-    noise_level = util.env_opt('noise_level', 0.25)
+    noise_level = util.env_opt('noise_level', 0.0)

     if resume:
         prints(output_path / 'model19.pickle', title="Resuming training")

@@ -95,15 +97,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_shared=True)
                 pbar.update(sum(len(doc) for doc in docs))

         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
@@ -5,6 +5,7 @@ import six
 import ftfy
 import sys
 import ujson
+import itertools

 from thinc.neural.util import copy_array

@@ -35,6 +36,7 @@ CudaStream = CudaStream
 cupy = cupy
 fix_text = ftfy.fix_text
 copy_array = copy_array
+izip = getattr(itertools, 'izip', zip)

 is_python2 = six.PY2
 is_python3 = six.PY3

@@ -44,21 +46,31 @@ is_osx = sys.platform == 'darwin'


 if is_python2:
+    import imp
     bytes_ = str
     unicode_ = unicode
     basestring_ = basestring
     input_ = raw_input
-    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
     path2str = lambda path: str(path).decode('utf8')

 elif is_python3:
+    import importlib.util
     bytes_ = bytes
     unicode_ = str
     basestring_ = str
     input_ = input
-    json_dumps = lambda data: ujson.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
     path2str = lambda path: str(path)


+def b_to_str(b_str):
+    if is_python2:
+        return b_str
+    # important: if no encoding is set, string becomes "b'...'"
+    return str(b_str, encoding='utf8')
+
+
 def getattr_(obj, name, *default):
     if is_python3 and isinstance(name, bytes):
         name = name.decode('utf8')

@@ -92,3 +104,12 @@ def normalize_string_keys(old):
     return new


+def import_file(name, loc):
+    loc = str(loc)
+    if is_python2:
+        return imp.load_source(name, loc)
+    else:
+        spec = importlib.util.spec_from_file_location(name, str(loc))
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
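For context (not part of the commit), a minimal sketch of how the helpers added to the compat module behave on either Python version; the module path passed to import_file is hypothetical.

    from spacy.compat import b_to_str, izip, json_dumps, import_file

    assert b_to_str(b'200 OK') == '200 OK'          # native str on Python 2 and 3
    pairs = list(izip([1, 2], ['a', 'b']))          # itertools.izip on 2, zip on 3
    print(json_dumps({'url': 'https://spacy.io'}))  # forward slashes are no longer escaped
    # module = import_file('my_module', '/tmp/my_module.py')  # load a module from a file path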
@@ -15,7 +15,7 @@ def depr_model_download(lang):
     lang (unicode): Language shortcut, 'en' or 'de'.
     """
     prints("The spacy.%s.download command is now deprecated. Please use "
-           "python -m spacy download [model name or shortcut] instead. For "
+           "spacy download [model name or shortcut] instead. For "
           "more info, see the documentation:" % lang,
           about.__docs_models__,
          "Downloading default '%s' model now..." % lang,
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc
+from ..compat import b_to_str
 from ..util import prints, is_in_jupyter


@@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,


     def app(environ, start_response):
-        start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
+        # headers and status need to be bytes in Python 2, see #1227
+        headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
+        start_response(b_to_str(b'200 OK'), headers)
         res = _html['parsed'].encode(encoding='utf-8')
         return [res]

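Usage sketch (not part of the commit): displacy.serve() is the entry point that wraps the app() WSGI callable patched above, so the bytes-to-str conversion only matters when serving under Python 2. Any model with a parser will do; 'en' is just an example.

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')
    displacy.serve(doc, style='dep')   # serves the visualisation, by default on port 5000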
@@ -60,7 +60,7 @@ GLOSSARY = {
     'JJR': 'adjective, comparative',
     'JJS': 'adjective, superlative',
     'LS': 'list item marker',
-    'MD': 'verb, modal auxillary',
+    'MD': 'verb, modal auxiliary',
     'NIL': 'missing tag',
     'NN': 'noun, singular or mass',
     'NNP': 'noun, proper singular',

@@ -91,7 +91,7 @@ GLOSSARY = {
     'NFP': 'superfluous punctuation',
     'GW': 'additional word in multi-word expression',
     'XX': 'unknown',
-    'BES': 'auxillary "be"',
+    'BES': 'auxiliary "be"',
     'HVS': 'forms of "have"',

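The two corrected entries are easiest to check by looking them up in the GLOSSARY mapping directly (a sketch, not part of the commit; the spacy.glossary module path is assumed):

    from spacy.glossary import GLOSSARY

    print(GLOSSARY['MD'])    # 'verb, modal auxiliary'
    print(GLOSSARY['BES'])   # 'auxiliary "be"'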
@@ -9,6 +9,7 @@ cdef struct GoldParseC:
     int* tags
     int* heads
     int* has_dep
+    int* sent_start
     attr_t* labels
     int** brackets
     Transition* ner

@@ -29,6 +30,7 @@ cdef class GoldParse:
     cdef public list ner
     cdef public list ents
     cdef public dict brackets
+    cdef public object cats

     cdef readonly list cand_to_gold
     cdef readonly list gold_to_cand
@@ -381,7 +381,8 @@ cdef class GoldParse:
                          make_projective=make_projective)

     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False):
+                 deps=None, entities=None, make_projective=False,
+                 cats=tuple()):
         """Create a GoldParse.

         doc (Doc): The document the annotations refer to.

@@ -392,6 +393,12 @@ cdef class GoldParse:
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
+        cats (iterable): A sequence of labels for text classification. Each
+            label may be a string or an int, or a `(start_char, end_char, label)`
+            tuple, indicating that the label is applied to only part of the
+            document (usually a sentence). Unlike entity annotations, label
+            annotations can overlap, i.e. a single word can be covered by
+            multiple labelled spans.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:

@@ -399,11 +406,11 @@ cdef class GoldParse:
         if tags is None:
             tags = [None for _ in doc]
         if heads is None:
-            heads = [token.i for token in doc]
+            heads = [None for token in doc]
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = ['-' for _ in doc]
+            entities = [None for _ in doc]
         elif len(entities) == 0:
             entities = ['O' for _ in doc]
         elif not isinstance(entities[0], basestring):

@@ -419,8 +426,10 @@ cdef class GoldParse:
         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
         self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

+        self.cats = list(cats)
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)

@@ -474,8 +483,12 @@ cdef class GoldParse:
         """
         return not nonproj.is_nonproj_tree(self.heads)

+    @property
+    def sent_starts(self):
+        return [self.c.sent_start[i] for i in range(self.length)]


-def biluo_tags_from_offsets(doc, entities):
+def biluo_tags_from_offsets(doc, entities, missing='O'):
     """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
     scheme (BILUO).

@@ -527,7 +540,7 @@ def biluo_tags_from_offsets(doc, entities):
         if i in entity_chars:
             break
     else:
-        biluo[token.i] = 'O'
+        biluo[token.i] = missing
     return biluo

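A short sketch (not part of the commit) of the two API changes above: biluo_tags_from_offsets() can now mark unannotated tokens as missing instead of forcing 'O', and GoldParse grows a cats argument for text-classification labels. nlp is assumed to be any loaded pipeline; the label 'TRAVEL' is arbitrary.

    from spacy.gold import GoldParse, biluo_tags_from_offsets

    doc = nlp(u'I like London')
    entities = [(7, 13, 'LOC')]                      # character offsets into the text

    tags = biluo_tags_from_offsets(doc, entities, missing=None)  # None for unannotated tokens
    gold = GoldParse(doc, entities=entities, cats=['TRAVEL'])
    assert gold.cats == ['TRAVEL']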
@@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)

 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K')
+          'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
spacy/lang/da/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk statup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]
spacy/lang/de/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]
spacy/lang/en/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]
@@ -59,7 +59,8 @@ MORPH_RULES = {

     "VBP": {
         "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
-        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
+        "'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
+        "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
     },

     "VBD": {
@@ -232,7 +232,10 @@ for verb_data in [
     {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
     {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
     {ORTH: "was", LEMMA: "be", NORM: "was"},
-    {ORTH: "were", LEMMA: "be", NORM: "were"}]:
+    {ORTH: "were", LEMMA: "be", NORM: "were"},
+    {ORTH: "have", NORM: "have"},
+    {ORTH: "has", LEMMA: "have", NORM: "has"},
+    {ORTH: "dare", NORM: "dare"}]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
     for data in [verb_data, verb_data_tc]:
spacy/lang/es/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]
@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS

@@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
     infixes = tuple(TOKENIZER_INFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
+    syntax_iterators = dict(SYNTAX_ITERATORS)

     @classmethod
     def create_lemmatizer(cls, nlp=None):
(One file's diff is suppressed because it is too large.)

spacy/lang/fr/examples.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
+    "Les voitures autonomes voient leur assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]
spacy/lang/fr/syntax_iterators.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
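Usage sketch (not part of the commit): once SYNTAX_ITERATORS is registered on FrenchDefaults (see the fr/__init__.py hunk above), Doc.noun_chunks picks up this iterator. A French model with a parser is assumed; the output depends on the parse.

    import spacy

    nlp = spacy.load('fr')
    doc = nlp(u"La France ne devrait pas manquer d'électricité cet été")
    for chunk in doc.noun_chunks:
        print(chunk.text)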
spacy/lang/he/examples.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]
spacy/lang/id/__init__.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lemmatizer import LOOKUP
+from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...lemmatizerlookup import Lemmatizer
+from ...attrs import LANG
+from ...util import update_exc
+
+
+class IndonesianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'id'
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
+class Indonesian(Language):
+    lang = 'id'
+    Defaults = IndonesianDefaults
+
+
+__all__ = ['Indonesian']
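A minimal sketch (not part of the commit) of using the new Indonesian language class; with no model loaded it only tokenizes, which is enough to exercise the stop words and lexical attributes shipped here.

    from spacy.lang.id import Indonesian

    nlp = Indonesian()
    doc = nlp(u'Jakarta adalah kota besar yang nyaris tidak pernah tidur.')
    print([token.text for token in doc])
    print(nlp.Defaults.stop_words & {'adalah', 'yang'})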
spacy/lang/id/_tokenizer_exceptions_list.py (new file, 3833 lines)
(Diff suppressed because it is too large.)
spacy/lang/id/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
+    "Abu Sayyaf mengeksekusi sandera warga Filipina",
+    "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
+    "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
+    "Jakarta adalah kota besar yang nyaris tidak pernah tidur."
+    "Kamu ada di mana semalam?",
+    "Siapa yang membeli makanan ringan tersebut?",
+    "Siapa presiden pertama Republik Indonesia?"
+]
spacy/lang/id/lemmatizer.py (new file, 36883 lines)
(Diff suppressed because it is too large.)
spacy/lang/id/lex_attrs.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
+              'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
+              'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
+              'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
+              'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
+              'gajillion', 'bazillion',
+              'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
+              'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
+              'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
+              'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
+              'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
+              'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
+              'noniliun', 'desiliun',
+              ]
+
+
+def like_num(text):
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    if text.count('-') == 1:
+        _, num = text.split('-')
+        if num.isdigit() or num in _num_words:
+            return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
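A few spot checks of the like_num() logic added above (a sketch, not part of the commit):

    from spacy.lang.id.lex_attrs import like_num

    assert like_num('3.000')       # separators are stripped before isdigit()
    assert like_num('sembilan')    # Indonesian number word
    assert like_num('ke-4')        # hyphenated form whose right-hand side is a digit
    assert not like_num('Jakarta')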
spacy/lang/id/norm_exceptions.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+_exc = {
+    "Rp": "$",
+    "IDR": "$",
+    "RMB": "$",
+    "USD": "$",
+    "AUD": "$",
+    "GBP": "$",
+}
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
spacy/lang/id/punctuation.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ..char_classes import merge_chars, split_chars, _currency, _units
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
+from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
+
+_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
+          'Hz kHz MHz GHz mAh '
+          'ratus rb ribu ribuan '
+          'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
+          )
+_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
+_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
+           'Oktober November Desember January February March May June '
+           'July August October December Jan Feb Mar Jun Jul Aug Sept '
+           'Oct Okt Nov Des ')
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
+HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+TOKENIZER_PREFIXES.remove('#')  # hashtag
+_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
+
+_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
+    r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
+    r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+    r'(?<=[0-9])%',
+    r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
+    r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
+]
+
+_infixes = TOKENIZER_INFIXES + [
+    r'(?<=[0-9])[\\/](?=[0-9%-])',
+    r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
+    r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
+    r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
+    r'(?<=[0-9\)][\.,])"(?=[0-9])',
+    r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
+    r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
763
spacy/lang/id/stop_words.py
Normal file
763
spacy/lang/id/stop_words.py
Normal file
|
@ -0,0 +1,763 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
STOP_WORDS = set("""
|
||||||
|
ada
|
||||||
|
adalah
|
||||||
|
adanya
|
||||||
|
adapun
|
||||||
|
agak
|
||||||
|
agaknya
|
||||||
|
agar
|
||||||
|
akan
|
||||||
|
akankah
|
||||||
|
akhir
|
||||||
|
akhiri
|
||||||
|
akhirnya
|
||||||
|
aku
|
||||||
|
akulah
|
||||||
|
amat
|
||||||
|
amatlah
|
||||||
|
anda
|
||||||
|
andalah
|
||||||
|
antar
|
||||||
|
antara
|
||||||
|
antaranya
|
||||||
|
apa
|
||||||
|
apaan
|
||||||
|
apabila
|
||||||
|
apakah
|
||||||
|
apalagi
|
||||||
|
apatah
|
||||||
|
artinya
|
||||||
|
asal
|
||||||
|
asalkan
|
||||||
|
atas
|
||||||
|
atau
|
||||||
|
ataukah
|
||||||
|
ataupun
|
||||||
|
awal
|
||||||
|
awalnya
|
||||||
|
bagai
|
||||||
|
bagaikan
|
||||||
|
bagaimana
|
||||||
|
bagaimanakah
|
||||||
|
bagaimanapun
|
||||||
|
bagi
|
||||||
|
bagian
|
||||||
|
bahkan
|
||||||
|
bahwa
|
||||||
|
bahwasanya
|
||||||
|
baik
|
||||||
|
bakal
|
||||||
|
bakalan
|
||||||
|
balik
|
||||||
|
banyak
|
||||||
|
bapak
|
||||||
|
baru
|
||||||
|
bawah
|
||||||
|
beberapa
|
||||||
|
begini
|
||||||
|
beginian
|
||||||
|
beginikah
|
||||||
|
beginilah
|
||||||
|
begitu
|
||||||
|
begitukah
|
||||||
|
begitulah
|
||||||
|
begitupun
|
||||||
|
bekerja
|
||||||
|
belakang
|
||||||
|
belakangan
|
||||||
|
belum
|
||||||
|
belumlah
|
||||||
|
benar
|
||||||
|
benarkah
|
||||||
|
benarlah
|
||||||
|
berada
|
||||||
|
berakhir
|
||||||
|
berakhirlah
|
||||||
|
berakhirnya
|
||||||
|
berapa
|
||||||
|
berapakah
|
||||||
|
berapalah
|
||||||
|
berapapun
|
||||||
|
berarti
|
||||||
|
berawal
|
||||||
|
berbagai
|
||||||
|
berdatangan
|
||||||
|
beri
|
||||||
|
berikan
|
||||||
|
berikut
|
||||||
|
berikutnya
|
||||||
|
berjumlah
|
||||||
|
berkali-kali
|
||||||
|
berkata
|
||||||
|
berkehendak
|
||||||
|
berkeinginan
|
||||||
|
berkenaan
|
||||||
|
berlainan
|
||||||
|
berlalu
|
||||||
|
berlangsung
|
||||||
|
berlebihan
|
||||||
|
bermacam
|
||||||
|
bermacam-macam
|
||||||
|
bermaksud
|
||||||
|
bermula
|
||||||
|
bersama
|
||||||
|
bersama-sama
|
||||||
|
bersiap
|
||||||
|
bersiap-siap
|
||||||
|
bertanya
|
||||||
|
bertanya-tanya
|
||||||
|
berturut
|
||||||
|
berturut-turut
|
||||||
|
bertutur
|
||||||
|
berujar
|
||||||
|
berupa
|
||||||
|
besar
|
||||||
|
betul
|
||||||
|
betulkah
|
||||||
|
biasa
|
||||||
|
biasanya
|
||||||
|
bila
|
||||||
|
bilakah
|
||||||
|
bisa
|
||||||
|
bisakah
|
||||||
|
boleh
|
||||||
|
bolehkah
|
||||||
|
bolehlah
|
||||||
|
buat
|
||||||
|
bukan
|
||||||
|
bukankah
|
||||||
|
bukanlah
|
||||||
|
bukannya
|
||||||
|
bulan
|
||||||
|
bung
|
||||||
|
cara
|
||||||
|
caranya
|
||||||
|
cukup
|
||||||
|
cukupkah
|
||||||
|
cukuplah
|
||||||
|
cuma
|
||||||
|
dahulu
|
||||||
|
dalam
|
||||||
|
dan
|
||||||
|
dapat
|
||||||
|
dari
|
||||||
|
daripada
|
||||||
|
datang
|
||||||
|
dekat
|
||||||
|
demi
|
||||||
|
demikian
|
||||||
|
demikianlah
|
||||||
|
dengan
|
||||||
|
depan
|
||||||
|
di
|
||||||
|
dia
|
||||||
|
diakhiri
|
||||||
|
diakhirinya
|
||||||
|
dialah
|
||||||
|
diantara
|
||||||
|
diantaranya
|
||||||
|
diberi
|
||||||
|
diberikan
|
||||||
|
diberikannya
|
||||||
|
dibuat
|
||||||
|
dibuatnya
|
||||||
|
didapat
|
||||||
|
didatangkan
|
||||||
|
digunakan
|
||||||
|
diibaratkan
|
||||||
|
diibaratkannya
|
||||||
|
diingat
|
||||||
|
diingatkan
|
||||||
|
diinginkan
|
||||||
|
dijawab
|
||||||
|
dijelaskan
|
||||||
|
dijelaskannya
|
||||||
|
dikarenakan
|
||||||
|
dikatakan
|
||||||
|
dikatakannya
|
||||||
|
dikerjakan
|
||||||
|
diketahui
|
||||||
|
diketahuinya
|
||||||
|
dikira
|
||||||
|
dilakukan
|
||||||
|
dilalui
|
||||||
|
dilihat
|
||||||
|
dimaksud
|
||||||
|
dimaksudkan
|
||||||
|
dimaksudkannya
|
||||||
|
dimaksudnya
|
||||||
|
diminta
|
||||||
|
dimintai
|
||||||
|
dimisalkan
|
||||||
|
dimulai
|
||||||
|
dimulailah
|
||||||
|
dimulainya
|
||||||
|
dimungkinkan
|
||||||
|
dini
|
||||||
|
dipastikan
|
||||||
|
diperbuat
|
||||||
|
diperbuatnya
|
||||||
|
dipergunakan
|
||||||
|
diperkirakan
|
||||||
|
diperlihatkan
|
||||||
|
diperlukan
|
||||||
|
diperlukannya
|
||||||
|
dipersoalkan
|
||||||
|
dipertanyakan
|
||||||
|
dipunyai
|
||||||
|
diri
|
||||||
|
dirinya
|
||||||
|
disampaikan
|
||||||
|
disebut
|
||||||
|
disebutkan
|
||||||
|
disebutkannya
|
||||||
|
disini
|
||||||
|
disinilah
|
||||||
|
ditambahkan
|
||||||
|
ditandaskan
|
||||||
|
ditanya
|
||||||
|
ditanyai
|
||||||
|
ditanyakan
|
||||||
|
ditegaskan
|
||||||
|
ditujukan
|
||||||
|
ditunjuk
|
||||||
|
ditunjuki
|
||||||
|
ditunjukkan
|
||||||
|
ditunjukkannya
|
||||||
|
ditunjuknya
|
||||||
|
dituturkan
|
||||||
|
dituturkannya
|
||||||
|
diucapkan
|
||||||
|
diucapkannya
|
||||||
|
diungkapkan
|
||||||
|
dong
|
||||||
|
dua
|
||||||
|
dulu
|
||||||
|
empat
|
||||||
|
enggak
|
||||||
|
enggaknya
|
||||||
|
entah
|
||||||
|
entahlah
|
||||||
|
guna
|
||||||
|
gunakan
|
||||||
|
hal
|
||||||
|
hampir
|
||||||
|
hanya
|
||||||
|
hanyalah
|
||||||
|
hari
|
||||||
|
harus
|
||||||
|
haruslah
|
||||||
|
harusnya
|
||||||
|
hendak
|
||||||
|
hendaklah
|
||||||
|
hendaknya
|
||||||
|
hingga
|
||||||
|
ia
|
||||||
|
ialah
|
||||||
|
ibarat
|
||||||
|
ibaratkan
|
||||||
|
ibaratnya
|
||||||
|
ibu
|
||||||
|
ikut
|
||||||
|
ingat
|
||||||
|
ingat-ingat
|
||||||
|
ingin
|
||||||
|
inginkah
|
||||||
|
inginkan
|
||||||
|
ini
|
||||||
|
inikah
|
||||||
|
inilah
|
||||||
|
itu
|
||||||
|
itukah
|
||||||
|
itulah
|
||||||
|
jadi
|
||||||
|
jadilah
|
||||||
|
jadinya
|
||||||
|
jangan
|
||||||
|
jangankan
|
||||||
|
janganlah
|
||||||
|
jauh
|
||||||
|
jawab
|
||||||
|
jawaban
|
||||||
|
jawabnya
|
||||||
|
jelas
|
||||||
|
jelaskan
|
||||||
|
jelaslah
|
||||||
|
jelasnya
|
||||||
|
jika
|
||||||
|
jikalau
|
||||||
|
juga
|
||||||
|
jumlah
|
||||||
|
jumlahnya
|
||||||
|
justru
|
||||||
|
kala
|
||||||
|
kalau
|
||||||
|
kalaulah
|
||||||
|
kalaupun
|
||||||
|
kalian
|
||||||
|
kami
|
||||||
|
kamilah
|
||||||
|
kamu
|
||||||
|
kamulah
|
||||||
|
kan
|
||||||
|
kapan
|
||||||
|
kapankah
|
||||||
|
kapanpun
|
||||||
|
karena
|
||||||
|
karenanya
|
||||||
|
kasus
|
||||||
|
kata
|
||||||
|
katakan
|
||||||
|
katakanlah
|
||||||
|
katanya
|
||||||
|
ke
|
||||||
|
keadaan
|
||||||
|
kebetulan
|
||||||
|
kecil
|
||||||
|
kedua
|
||||||
|
keduanya
|
||||||
|
keinginan
|
||||||
|
kelamaan
|
||||||
|
kelihatan
|
||||||
|
kelihatannya
|
||||||
|
kelima
|
||||||
|
keluar
|
||||||
|
kembali
|
||||||
|
kemudian
|
||||||
|
kemungkinan
|
||||||
|
kemungkinannya
|
||||||
|
kenapa
|
||||||
|
kepada
|
||||||
|
kepadanya
|
||||||
|
kesampaian
|
||||||
|
keseluruhan
|
||||||
|
keseluruhannya
|
||||||
|
keterlaluan
|
||||||
|
ketika
|
||||||
|
khususnya
|
||||||
|
kini
|
||||||
|
kinilah
|
||||||
|
kira
|
||||||
|
kira-kira
|
||||||
|
kiranya
|
||||||
|
kita
|
||||||
|
kitalah
|
||||||
|
kok
|
||||||
|
kurang
|
||||||
|
lagi
|
||||||
|
lagian
|
||||||
|
lah
|
||||||
|
lain
|
||||||
|
lainnya
|
||||||
|
lalu
|
||||||
|
lama
|
||||||
|
lamanya
|
||||||
|
lanjut
|
||||||
|
lanjutnya
|
||||||
|
lebih
|
||||||
|
lewat
|
||||||
|
lima
|
||||||
|
luar
|
||||||
|
macam
|
||||||
|
maka
|
||||||
|
makanya
|
||||||
|
makin
|
||||||
|
malah
|
||||||
|
malahan
|
||||||
|
mampu
|
||||||
|
mampukah
|
||||||
|
mana
|
||||||
|
manakala
|
||||||
|
manalagi
|
||||||
|
masa
|
||||||
|
masalah
|
||||||
|
masalahnya
|
||||||
|
masih
|
||||||
|
masihkah
|
||||||
|
masing
|
||||||
|
masing-masing
|
||||||
|
mau
|
||||||
|
maupun
|
||||||
|
melainkan
|
||||||
|
melakukan
|
||||||
|
melalui
|
||||||
|
melihat
|
||||||
|
melihatnya
|
||||||
|
memang
|
||||||
|
memastikan
|
||||||
|
memberi
|
||||||
|
memberikan
|
||||||
|
membuat
|
||||||
|
memerlukan
|
||||||
|
memihak
|
||||||
|
meminta
|
||||||
|
memintakan
|
||||||
|
memisalkan
|
||||||
|
memperbuat
|
||||||
|
mempergunakan
|
||||||
|
memperkirakan
|
||||||
|
memperlihatkan
|
||||||
|
mempersiapkan
|
||||||
|
mempersoalkan
|
||||||
|
mempertanyakan
|
||||||
|
mempunyai
|
||||||
|
memulai
|
||||||
|
memungkinkan
|
||||||
|
menaiki
|
||||||
|
menambahkan
|
||||||
|
menandaskan
|
||||||
|
menanti
|
||||||
|
menanti-nanti
|
||||||
|
menantikan
|
||||||
|
menanya
|
||||||
|
menanyai
|
||||||
|
menanyakan
|
||||||
|
mendapat
|
||||||
|
mendapatkan
|
||||||
|
mendatang
|
||||||
|
mendatangi
|
||||||
|
mendatangkan
|
||||||
|
menegaskan
|
||||||
|
mengakhiri
|
||||||
|
mengapa
|
||||||
|
mengatakan
|
||||||
|
mengatakannya
|
||||||
|
mengenai
|
||||||
|
mengerjakan
|
||||||
|
mengetahui
|
||||||
|
menggunakan
|
||||||
|
menghendaki
|
||||||
|
mengibaratkan
|
||||||
|
mengibaratkannya
|
||||||
|
mengingat
|
||||||
|
mengingatkan
|
||||||
|
menginginkan
|
||||||
|
mengira
|
||||||
|
mengucapkan
|
||||||
|
mengucapkannya
|
||||||
|
mengungkapkan
|
||||||
|
menjadi
|
||||||
|
menjawab
|
||||||
|
menjelaskan
|
||||||
|
menuju
|
||||||
|
menunjuk
|
||||||
|
menunjuki
|
||||||
|
menunjukkan
|
||||||
|
menunjuknya
|
||||||
|
menurut
|
||||||
|
menuturkan
|
||||||
|
menyampaikan
|
||||||
|
menyangkut
|
||||||
|
menyatakan
|
||||||
|
menyebutkan
|
||||||
|
menyeluruh
|
||||||
|
menyiapkan
|
||||||
|
merasa
|
||||||
|
mereka
|
||||||
|
merekalah
|
||||||
|
merupakan
|
||||||
|
meski
|
||||||
|
meskipun
|
||||||
|
meyakini
|
||||||
|
meyakinkan
|
||||||
|
minta
|
||||||
|
mirip
|
||||||
|
misal
|
||||||
|
misalkan
|
||||||
|
misalnya
|
||||||
|
mula
|
||||||
|
mulai
|
||||||
|
mulailah
|
||||||
|
mulanya
|
||||||
|
mungkin
|
||||||
|
mungkinkah
|
||||||
|
nah
|
||||||
|
naik
|
||||||
|
namun
|
||||||
|
nanti
|
||||||
|
nantinya
|
||||||
|
nyaris
|
||||||
|
nyatanya
|
||||||
|
oleh
|
||||||
|
olehnya
|
||||||
|
pada
|
||||||
|
padahal
|
||||||
|
padanya
|
||||||
|
pak
|
||||||
|
paling
|
||||||
|
panjang
|
||||||
|
pantas
|
||||||
|
para
|
||||||
|
pasti
|
||||||
|
pastilah
|
||||||
|
penting
|
||||||
|
pentingnya
|
||||||
|
per
|
||||||
|
percuma
|
||||||
|
perlu
|
||||||
|
perlukah
|
||||||
|
perlunya
|
||||||
|
pernah
|
||||||
|
persoalan
|
||||||
|
pertama
|
||||||
|
pertama-tama
|
||||||
|
pertanyaan
|
||||||
|
pertanyakan
|
||||||
|
pihak
|
||||||
|
pihaknya
|
||||||
|
pukul
|
||||||
|
pula
|
||||||
|
pun
|
||||||
|
punya
|
||||||
|
rasa
|
||||||
|
rasanya
|
||||||
|
rata
|
||||||
|
rupanya
|
||||||
|
saat
|
||||||
|
saatnya
|
||||||
|
saja
|
||||||
|
sajalah
|
||||||
|
saling
|
||||||
|
sama
|
||||||
|
sama-sama
|
||||||
|
sambil
|
||||||
|
sampai
|
||||||
|
sampai-sampai
|
||||||
|
sampaikan
|
||||||
|
sana
|
||||||
|
sangat
|
||||||
|
sangatlah
|
||||||
|
satu
|
||||||
|
saya
|
||||||
|
sayalah
|
||||||
|
se
|
||||||
|
sebab
|
||||||
|
sebabnya
|
||||||
|
sebagai
|
||||||
|
sebagaimana
|
||||||
|
sebagainya
|
||||||
|
sebagian
|
||||||
|
sebaik
|
||||||
|
sebaik-baiknya
|
||||||
|
sebaiknya
|
||||||
|
sebaliknya
|
||||||
|
sebanyak
|
||||||
|
sebegini
|
||||||
|
sebegitu
|
||||||
|
sebelum
|
||||||
|
sebelumnya
|
||||||
|
sebenarnya
|
||||||
|
seberapa
|
||||||
|
sebesar
|
||||||
|
sebetulnya
|
||||||
|
sebisanya
|
||||||
|
sebuah
|
||||||
|
sebut
|
||||||
|
sebutlah
|
||||||
|
sebutnya
|
||||||
|
secara
|
||||||
|
secukupnya
|
||||||
|
sedang
|
||||||
|
sedangkan
|
||||||
|
sedemikian
|
||||||
|
sedikit
|
||||||
|
sedikitnya
|
||||||
|
seenaknya
|
||||||
|
segala
|
||||||
|
segalanya
|
||||||
|
segera
|
||||||
|
seharusnya
|
||||||
|
sehingga
|
||||||
|
seingat
|
||||||
|
sejak
|
||||||
|
sejauh
|
||||||
|
sejenak
|
||||||
|
sejumlah
|
||||||
|
sekadar
|
||||||
|
sekadarnya
|
||||||
|
sekali
|
||||||
|
sekali-kali
|
||||||
|
sekalian
|
||||||
|
sekaligus
|
||||||
|
sekalipun
|
||||||
|
sekarang
|
||||||
|
sekarang
|
||||||
|
sekecil
|
||||||
|
seketika
|
||||||
|
sekiranya
|
||||||
|
sekitar
|
||||||
|
sekitarnya
|
||||||
|
sekurang-kurangnya
|
||||||
|
sekurangnya
|
||||||
|
sela
|
||||||
|
selain
|
||||||
|
selaku
|
||||||
|
selalu
|
||||||
|
selama
|
||||||
|
selama-lamanya
|
||||||
|
selamanya
|
||||||
|
selanjutnya
|
||||||
|
seluruh
|
||||||
|
seluruhnya
|
||||||
|
semacam
|
||||||
|
semakin
|
||||||
|
semampu
|
||||||
|
semampunya
|
||||||
|
semasa
|
||||||
|
semasih
|
||||||
|
semata
|
||||||
|
semata-mata
|
||||||
|
semaunya
|
||||||
|
sementara
|
||||||
|
semisal
|
||||||
|
semisalnya
|
||||||
|
sempat
|
||||||
|
semua
|
||||||
|
semuanya
|
||||||
|
semula
|
||||||
|
sendiri
|
||||||
|
sendirian
|
||||||
|
sendirinya
|
||||||
|
seolah
|
||||||
|
seolah-olah
|
||||||
|
seorang
|
||||||
|
sepanjang
|
||||||
|
sepantasnya
|
||||||
|
sepantasnyalah
|
||||||
|
seperlunya
|
||||||
|
seperti
|
||||||
|
sepertinya
|
||||||
|
sepihak
|
||||||
|
sering
|
||||||
|
seringnya
|
||||||
|
serta
|
||||||
|
serupa
|
||||||
|
sesaat
|
||||||
|
sesama
|
||||||
|
sesampai
|
||||||
|
sesegera
|
||||||
|
sesekali
|
||||||
|
seseorang
|
||||||
|
sesuatu
|
||||||
|
sesuatunya
|
||||||
|
sesudah
|
||||||
|
sesudahnya
|
||||||
|
setelah
|
||||||
|
setempat
|
||||||
|
setengah
|
||||||
|
seterusnya
|
||||||
|
setiap
|
||||||
|
setiba
|
||||||
|
setibanya
|
||||||
|
setidak-tidaknya
|
||||||
|
setidaknya
|
||||||
|
setinggi
|
||||||
|
seusai
|
||||||
|
sewaktu
|
||||||
|
siap
|
||||||
|
siapa
|
||||||
|
siapakah
|
||||||
|
siapapun
|
||||||
|
sini
|
||||||
|
sinilah
|
||||||
|
soal
|
||||||
|
soalnya
|
||||||
|
suatu
|
||||||
|
sudah
|
||||||
|
sudahkah
|
||||||
|
sudahlah
|
||||||
|
supaya
|
||||||
|
tadi
|
||||||
|
tadinya
|
||||||
|
tahu
|
||||||
|
tahun
|
||||||
|
tak
|
||||||
|
tambah
|
||||||
|
tambahnya
|
||||||
|
tampak
|
||||||
|
tampaknya
|
||||||
|
tandas
|
||||||
|
tandasnya
|
||||||
|
tanpa
|
||||||
|
tanya
|
||||||
|
tanyakan
|
||||||
|
tanyanya
|
||||||
|
tapi
|
||||||
|
tegas
|
||||||
|
tegasnya
|
||||||
|
telah
|
||||||
|
tempat
|
||||||
|
tengah
|
||||||
|
tentang
|
||||||
|
tentu
|
||||||
|
tentulah
|
||||||
|
tentunya
|
||||||
|
tepat
|
||||||
|
terakhir
|
||||||
|
terasa
|
||||||
|
terbanyak
|
||||||
|
terdahulu
|
||||||
|
terdapat
|
||||||
|
terdiri
|
||||||
|
terhadap
|
||||||
|
terhadapnya
|
||||||
|
teringat
|
||||||
|
teringat-ingat
|
||||||
|
terjadi
|
||||||
|
terjadilah
|
||||||
|
terjadinya
|
||||||
|
terkira
|
||||||
|
terlalu
|
||||||
|
terlebih
|
||||||
|
terlihat
|
||||||
|
termasuk
|
||||||
|
ternyata
|
||||||
|
tersampaikan
|
||||||
|
tersebut
|
||||||
|
tersebutlah
|
||||||
|
tertentu
|
||||||
|
tertuju
|
||||||
|
terus
|
||||||
|
terutama
|
||||||
|
tetap
|
||||||
|
tetapi
|
||||||
|
tiap
|
||||||
|
tiba
|
||||||
|
tiba-tiba
|
||||||
|
tidak
|
||||||
|
tidakkah
|
||||||
|
tidaklah
|
||||||
|
tiga
|
||||||
|
tinggi
|
||||||
|
toh
|
||||||
|
tunjuk
|
||||||
|
turut
|
||||||
|
tutur
|
||||||
|
tuturnya
|
||||||
|
ucap
|
||||||
|
ucapnya
|
||||||
|
ujar
|
||||||
|
ujarnya
|
||||||
|
umum
|
||||||
|
umumnya
|
||||||
|
ungkap
|
||||||
|
ungkapnya
|
||||||
|
untuk
|
||||||
|
usah
|
||||||
|
usai
|
||||||
|
waduh
|
||||||
|
wah
|
||||||
|
wahai
|
||||||
|
waktu
|
||||||
|
waktunya
|
||||||
|
walau
|
||||||
|
walaupun
|
||||||
|
wong
|
||||||
|
yaitu
|
||||||
|
yakin
|
||||||
|
yakni
|
||||||
|
yang
|
||||||
|
""".split())
|
42
spacy/lang/id/syntax_iterators.py
Normal file
42
spacy/lang/id/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(obj):
|
||||||
|
"""
|
||||||
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
|
"""
|
||||||
|
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings.add('conj')
|
||||||
|
np_label = doc.vocab.strings.add('NP')
|
||||||
|
seen = set()
|
||||||
|
for i, word in enumerate(obj):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
# Prevent nested chunks from being produced
|
||||||
|
if word.i in seen:
|
||||||
|
continue
|
||||||
|
if word.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
elif word.dep == conj:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
|
if head.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {
|
||||||
|
'noun_chunks': noun_chunks
|
||||||
|
}
|
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
50
spacy/lang/id/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import regex as re
|
||||||
|
|
||||||
|
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
|
||||||
|
from ..tokenizer_exceptions import URL_PATTERN
|
||||||
|
from ...symbols import ORTH
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for orth in ID_BASE_EXCEPTIONS:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
orth_title = orth.title()
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = orth.upper()
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
orth_lower = orth.lower()
|
||||||
|
_exc[orth_lower] = [{ORTH: orth_lower}]
|
||||||
|
|
||||||
|
if '-' in orth:
|
||||||
|
orth_title = '-'.join([part.title() for part in orth.split('-')])
|
||||||
|
_exc[orth_title] = [{ORTH: orth_title}]
|
||||||
|
|
||||||
|
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
|
||||||
|
_exc[orth_caps] = [{ORTH: orth_caps}]
|
||||||
|
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
|
||||||
|
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
|
||||||
|
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
|
||||||
|
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
|
||||||
|
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
|
||||||
|
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
|
||||||
|
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
|
||||||
|
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
|
||||||
|
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
|
||||||
|
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
|
||||||
|
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
|
||||||
|
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
||||||
|
|
18
spacy/lang/it/examples.py
Normal file
18
spacy/lang/it/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.it.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
|
||||||
|
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
|
||||||
|
"San Francisco prevede di bandire i robot di consegna porta a porta",
|
||||||
|
"Londra è una grande città del Regno Unito."
|
||||||
|
]
|
|
@@ -137,6 +137,7 @@ LEX_ATTRS = {
     attrs.IS_UPPER: lambda string: string.isupper(),
     attrs.IS_STOP: lambda string: False,
     attrs.IS_OOV: lambda string: True,
+    attrs.PROB: lambda string: -20.,
     attrs.LIKE_EMAIL: like_email,
    attrs.LIKE_NUM: like_num,
    attrs.IS_PUNCT: is_punct,
|
18
spacy/lang/nb/examples.py
Normal file
18
spacy/lang/nb/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.nb.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
|
||||||
|
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
|
||||||
|
"San Francisco vurderer å forby robotbud på fortauene",
|
||||||
|
"London er en stor by i Storbritannia."
|
||||||
|
]
|
|
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)

|
20
spacy/lang/pl/examples.py
Normal file
20
spacy/lang/pl/examples.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pl.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Poczuł przyjemną woń mocnej kawy.",
|
||||||
|
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
|
||||||
|
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
|
||||||
|
"Nowy abonament pod lupą Komisji Europejskiej",
|
||||||
|
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
|
||||||
|
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
|
||||||
|
]
|
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
23
spacy/lang/pl/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..symbols import ORTH, LEMMA, POS
|
||||||
|
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for exc_data in [
|
||||||
|
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
|
||||||
|
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
|
||||||
|
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||||
|
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||||
|
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||||
|
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
|
||||||
|
_exc[exc_data[ORTH]] = [dict(exc_data)],
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"w.", "r."]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(_exc)
|
18
spacy/lang/pt/examples.py
Normal file
18
spacy/lang/pt/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.pt.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
|
||||||
|
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
|
||||||
|
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
|
||||||
|
"Londres é a maior cidade do Reino Unido"
|
||||||
|
]
|
18
spacy/lang/sv/examples.py
Normal file
18
spacy/lang/sv/examples.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.sv.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
|
||||||
|
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
|
||||||
|
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
|
||||||
|
"London är en storstad i Storbritannien."
|
||||||
|
]
|
|
@@ -15,6 +15,7 @@ class Chinese(Language):
         raise ImportError("The Chinese tokenizer requires the Jieba library: "
                           "https://github.com/fxsjy/jieba")
         words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

@@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD
 import random
 import ujson
 from collections import OrderedDict
+import itertools

 from .tokenizer import Tokenizer
 from .vocab import Vocab
@ -22,8 +23,10 @@ from .pipeline import NeuralDependencyParser, EntityRecognizer
|
||||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
||||||
from .pipeline import NeuralLabeller
|
from .pipeline import NeuralLabeller
|
||||||
from .pipeline import SimilarityHook
|
from .pipeline import SimilarityHook
|
||||||
|
from .pipeline import TextCategorizer
|
||||||
|
from . import about
|
||||||
|
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps, izip
|
||||||
from .attrs import IS_STOP
|
from .attrs import IS_STOP
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
|
@ -92,7 +95,7 @@ class BaseDefaults(object):
|
||||||
meta = nlp.meta if nlp is not None else {}
|
meta = nlp.meta if nlp is not None else {}
|
||||||
# Resolve strings, like "cnn", "lstm", etc
|
# Resolve strings, like "cnn", "lstm", etc
|
||||||
pipeline = []
|
pipeline = []
|
||||||
for entry in cls.pipeline:
|
for entry in meta.get('pipeline', []):
|
||||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||||
continue
|
continue
|
||||||
factory = cls.Defaults.factories[entry]
|
factory = cls.Defaults.factories[entry]
|
||||||
|
@ -107,6 +110,8 @@ class BaseDefaults(object):
                                       NeuralDependencyParser(nlp.vocab, **cfg),
                                       nonproj.deprojectivize],
        'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
        'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
        'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
        # Temporary compatibility -- delete after pivot
        'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
        'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
@ -115,7 +120,6 @@ class BaseDefaults(object):
                                       nonproj.deprojectivize,
                                       ],
        'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
        'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
    }

    token_match = TOKEN_MATCH
@ -147,8 +151,8 @@ class Language(object):
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
                 disable=tuple(), **kwargs):
    def __init__(self, vocab=True, make_doc=True, pipeline=None,
                 meta={}, disable=tuple(), **kwargs):
        """Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -165,7 +169,7 @@ class Language(object):
            models to add model meta data.
        RETURNS (Language): The newly constructed object.
        """
        self.meta = dict(meta)
        self._meta = dict(meta)
        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
@ -196,6 +200,29 @@ class Language(object):
            else:
                flat_list.append(pipe)
        self.pipeline = flat_list
        self._optimizer = None

    @property
    def meta(self):
        self._meta.setdefault('lang', self.vocab.lang)
        self._meta.setdefault('name', '')
        self._meta.setdefault('version', '0.0.0')
        self._meta.setdefault('spacy_version', about.__version__)
        self._meta.setdefault('description', '')
        self._meta.setdefault('author', '')
        self._meta.setdefault('email', '')
        self._meta.setdefault('url', '')
        self._meta.setdefault('license', '')
        pipeline = []
        for component in self.pipeline:
            if hasattr(component, 'name'):
                pipeline.append(component.name)
        self._meta['pipeline'] = pipeline
        return self._meta

    @meta.setter
    def meta(self, value):
        self._meta = value

    # Conveniences to access pipeline components
    @property
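The `meta` property above fills in defaults lazily and mirrors the live pipeline. A minimal sketch of what a caller sees, assuming a bare `Language` instance (the model name is invented):

    from spacy.language import Language

    nlp = Language(meta={'name': 'my_model'})   # 'my_model' is a placeholder name
    meta = nlp.meta
    # Defaults such as 'version' and 'spacy_version' are filled in on access,
    # and 'pipeline' lists the names of the components currently attached.
    print(meta['lang'], meta['version'], meta['pipeline'])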
@ -251,7 +278,8 @@ class Language(object):
    def make_doc(self, text):
        return self.tokenizer(text)

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
    def update(self, docs, golds, drop=0., sgd=None, losses=None,
               update_shared=False):
        """Update the models in the pipeline.

        docs (iterable): A batch of `Doc` objects.
@ -266,6 +294,15 @@ class Language(object):
            >>> for docs, golds in epoch:
            >>> state = nlp.update(docs, golds, sgd=optimizer)
        """
        if len(docs) != len(golds):
            raise IndexError("Update expects same number of docs and golds "
                             "Got: %d, %d" % (len(docs), len(golds)))
        if len(docs) == 0:
            return
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = Adam(Model.ops, 0.001)
            sgd = self._optimizer
        tok2vec = self.pipeline[0]
        feats = tok2vec.doc2feats(docs)
        grads = {}
@ -273,14 +310,18 @@ class Language(object):
            grads[key] = (W, dW)
        pipes = list(self.pipeline[1:])
        random.shuffle(pipes)
        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
        for proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
            d_tokvecses = proc.update((docs, tokvecses), golds,
                                      drop=drop, sgd=get_grads, losses=losses)
            if d_tokvecses is not None:
            if update_shared and d_tokvecses is not None:
                bp_tokvecses(d_tokvecses, sgd=sgd)
                for i, d_tv in enumerate(d_tokvecses):
                    all_d_tokvecses[i] += d_tv
        if update_shared and bp_tokvecses is not None:
            bp_tokvecses(all_d_tokvecses, sgd=sgd)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
        # Clear the tensor variable, to free GPU memory.
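A rough sketch of a single training step against the new `update()` signature, assuming `docs` and `golds` are parallel lists of `Doc` and `GoldParse` objects and `optimizer` comes from the pipeline's own training setup; `update_shared=True` opts in to applying the accumulated `all_d_tokvecses` gradients to the shared tok2vec component:

    def train_step(nlp, optimizer, docs, golds):
        # Per-component losses are accumulated into this dict, keyed by name.
        losses = {}
        nlp.update(docs, golds, drop=0.2, sgd=optimizer,
                   losses=losses, update_shared=True)
        return losses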
@ -343,16 +384,25 @@ class Language(object):
        eps = util.env_opt('optimizer_eps', 1e-08)
        L2 = util.env_opt('L2_penalty', 1e-6)
        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
        optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                         beta2=beta2, eps=eps)
        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                               beta2=beta2, eps=eps)
        optimizer.max_grad_norm = max_grad_norm
        self._optimizer.max_grad_norm = max_grad_norm
        optimizer.device = device
        self._optimizer.device = device
        return optimizer
        return self._optimizer

    def evaluate(self, docs_golds):
        docs, golds = zip(*docs_golds)
        scorer = Scorer()
        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
        docs, golds = zip(*docs_golds)
        docs = list(docs)
        golds = list(golds)
        for pipe in self.pipeline:
            if not hasattr(pipe, 'pipe'):
                for doc in docs:
                    pipe(doc)
            else:
                docs = list(pipe.pipe(docs))
        assert len(docs) == len(golds)
        for doc, gold in zip(docs, golds):
            scorer.score(doc, gold)
            doc.tensor = None
        return scorer
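A brief usage sketch for the reworked `evaluate()`, assuming `dev_data` is an iterable of `(Doc, GoldParse)` pairs that the pipeline components can consume directly:

    scorer = nlp.evaluate(dev_data)
    # The Scorer exposes the aggregate accuracy metrics.
    print(scorer.uas, scorer.las, scorer.ents_f)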
@ -386,11 +436,16 @@ class Language(object):
        except StopIteration:
            pass

    def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
             disable=[]):
        """Process texts as a stream, and yield `Doc` objects in order. Supports
        GIL-free multi-threading.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool):
            If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
@ -402,8 +457,16 @@ class Language(object):
            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>> assert doc.is_parsed
        """
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
                             disable=disable)
            for doc, context in izip(docs, contexts):
                yield (doc, context)
            return
        docs = (self.make_doc(text) for text in texts)
        docs = texts
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disable:
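A small usage sketch for the new `as_tuples` flag documented above; the texts and context dicts are invented:

    data = [
        ('This is the first record.', {'id': 1}),
        ('And this is the second record.', {'id': 2}),
    ]
    for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50):
        print(context['id'], len(doc))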
@ -44,6 +44,11 @@ class Lemmatizer(object):
            return True
        elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
                                     morphology.get('Tense') == 'pres'):
            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
        elif VerbForm_inf in morphology:
@ -171,6 +171,8 @@ cdef class Lexeme:
    property rank:
        def __get__(self):
            return self.c.id

        def __set__(self, value):
            self.c.id = value

    property sentiment:
        def __get__(self):
@ -42,15 +42,148 @@ from .compat import json_dumps

from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X


class TokenVectorEncoder(object):
class SentenceSegmenter(object):
    '''A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse).

    To change the sentence boundary detection strategy, pass a generator
    function `strategy` on initialization, or assign a new strategy to
    the .strategy attribute.

    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    '''
    name = 'sbd'

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        doc.user_hooks['sents'] = self.strategy

    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start : word.i]
                start = word.i
                seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
        if start < len(doc):
            yield doc[start : len(doc)]
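A hedged sketch of plugging the new `SentenceSegmenter` hook into a pipeline with a custom strategy; splitting on newlines here is just an invented example, and `nlp` is assumed to be an already-loaded pipeline:

    from spacy.pipeline import SentenceSegmenter

    def split_on_newlines(doc):
        # Example strategy: yield one Span per newline-delimited chunk.
        start = 0
        for word in doc:
            if '\n' in word.text:
                yield doc[start : word.i + 1]
                start = word.i + 1
        if start < len(doc):
            yield doc[start : len(doc)]

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
    nlp.pipeline.append(sbd)
    doc = nlp('First line.\nSecond line.')
    print([sent.text for sent in doc.sents])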
class BaseThincComponent(object):
|
||||||
|
name = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def Model(cls, *shape, **kwargs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
scores = self.predict([doc])
|
||||||
|
self.set_annotations([doc], scores)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
|
scores = self.predict(docs)
|
||||||
|
self.set_annotations(docs, scores)
|
||||||
|
yield from docs
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_loss(self, docs, golds, scores):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
|
token_vector_width = pipeline[0].model.nO
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(1, token_vector_width)
|
||||||
|
|
||||||
|
def use_params(self, params):
|
||||||
|
with self.model.use_params(params):
|
||||||
|
yield
|
||||||
|
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda: json_dumps(self.cfg)),
|
||||||
|
('model', lambda: self.model.to_bytes()),
|
||||||
|
('vocab', lambda: self.vocab.to_bytes())
|
||||||
|
))
|
||||||
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
def load_model(b):
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
|
deserialize = OrderedDict((
|
||||||
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('model', load_model),
|
||||||
|
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||||
|
))
|
||||||
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **exclude):
|
||||||
|
serialize = OrderedDict((
|
||||||
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||||
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
|
('vocab', lambda p: self.vocab.to_disk(p))
|
||||||
|
))
|
||||||
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
|
def from_disk(self, path, **exclude):
|
||||||
|
def load_model(p):
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
|
deserialize = OrderedDict((
|
||||||
|
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
|
('model', load_model),
|
||||||
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
|
))
|
||||||
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def _load_cfg(path):
|
||||||
|
if path.exists():
|
||||||
|
return ujson.load(path.open())
|
||||||
|
else:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
class TokenVectorEncoder(BaseThincComponent):
|
||||||
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
||||||
name = 'tensorizer'
|
name = 'tensorizer'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, width=128, embed_size=7500, **cfg):
|
def Model(cls, width=128, embed_size=4000, **cfg):
|
||||||
"""Create a new statistical model for the class.
|
"""Create a new statistical model for the class.
|
||||||
|
|
||||||
width (int): Output size of the model.
|
width (int): Output size of the model.
|
||||||
|
@ -79,6 +212,7 @@ class TokenVectorEncoder(object):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.doc2feats = doc2feats()
|
self.doc2feats = doc2feats()
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
|
@ -144,7 +278,7 @@ class TokenVectorEncoder(object):
|
||||||
# TODO: implement
|
# TODO: implement
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
"""Allocate models, pre-process training data and acquire a trainer and
|
"""Allocate models, pre-process training data and acquire a trainer and
|
||||||
optimizer.
|
optimizer.
|
||||||
|
|
||||||
|
@ -155,74 +289,34 @@ class TokenVectorEncoder(object):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model()
|
||||||
|
|
||||||
def use_params(self, params):
|
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
|
||||||
params dictionary.
|
|
||||||
|
|
||||||
params (dict): A dictionary of parameters keyed by model ID.
|
class NeuralTagger(BaseThincComponent):
|
||||||
"""
|
|
||||||
with self.model.use_params(params):
|
|
||||||
yield
|
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda: self.model.to_bytes()),
|
|
||||||
('vocab', lambda: self.vocab.to_bytes())
|
|
||||||
))
|
|
||||||
return util.to_bytes(serialize, exclude)
|
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda b: self.model.from_bytes(b)),
|
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
|
||||||
))
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
|
||||||
('vocab', lambda p: self.vocab.to_disk(p))
|
|
||||||
))
|
|
||||||
util.to_disk(path, serialize, exclude)
|
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
|
||||||
('vocab', lambda p: self.vocab.from_disk(p))
|
|
||||||
))
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(object):
|
|
||||||
name = 'tagger'
|
name = 'tagger'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
tags = self.predict([doc.tensor])
|
tags = self.predict(([doc], [doc.tensor]))
|
||||||
self.set_annotations([doc], tags)
|
self.set_annotations([doc], tags)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
for docs in cytoolz.partition_all(batch_size, stream):
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [d.tensor for d in docs]
|
||||||
tag_ids = self.predict(tokvecs)
|
tag_ids = self.predict((docs, tokvecs))
|
||||||
self.set_annotations(docs, tag_ids)
|
self.set_annotations(docs, tag_ids)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, tokvecs):
|
def predict(self, docs_tokvecs):
|
||||||
scores = self.model(tokvecs)
|
scores = self.model(docs_tokvecs)
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
if not isinstance(guesses, numpy.ndarray):
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
|
tokvecs = docs_tokvecs[1]
|
||||||
guesses = self.model.ops.unflatten(guesses,
|
guesses = self.model.ops.unflatten(guesses,
|
||||||
[tv.shape[0] for tv in tokvecs])
|
[tv.shape[0] for tv in tokvecs])
|
||||||
return guesses
|
return guesses
|
||||||
|
@ -235,6 +329,8 @@ class NeuralTagger(object):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
|
if hasattr(doc_tag_ids, 'get'):
|
||||||
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
# Don't clobber preset POS tags
|
# Don't clobber preset POS tags
|
||||||
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
||||||
|
@ -243,16 +339,18 @@ class NeuralTagger(object):
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
|
||||||
if self.model.nI is None:
|
if self.model.nI is None:
|
||||||
self.model.nI = tokvecs[0].shape[1]
|
self.model.nI = tokvecs[0].shape[1]
|
||||||
|
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
|
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += loss
|
||||||
return d_tokvecs
|
return d_tokvecs
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
|
@ -276,7 +374,7 @@ class NeuralTagger(object):
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
new_tag_map = {}
|
new_tag_map = {}
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
|
@ -300,9 +398,7 @@ class NeuralTagger(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return with_flatten(
|
return build_tagger_model(n_tags, token_vector_width)
|
||||||
chain(Maxout(token_vector_width, token_vector_width),
|
|
||||||
Softmax(n_tags, token_vector_width)))
|
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
@ -321,7 +417,8 @@ class NeuralTagger(object):
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
|
@ -348,13 +445,15 @@ class NeuralTagger(object):
|
||||||
use_bin_type=True,
|
use_bin_type=True,
|
||||||
encoding='utf8'))),
|
encoding='utf8'))),
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
|
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
|
||||||
))
|
))
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
self.cfg.get('token_vector_width', 128))
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open('rb').read())
|
||||||
|
|
||||||
|
@ -370,6 +469,7 @@ class NeuralTagger(object):
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
('model', load_model),
|
('model', load_model),
|
||||||
|
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
|
||||||
))
|
))
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
@ -377,15 +477,23 @@ class NeuralTagger(object):
|
||||||
|
|
||||||
class NeuralLabeller(NeuralTagger):
|
class NeuralLabeller(NeuralTagger):
|
||||||
name = 'nn_labeller'
|
name = 'nn_labeller'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.labels = {}
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self):
|
||||||
|
return self.cfg.setdefault('labels', {})
|
||||||
|
|
||||||
|
@labels.setter
|
||||||
|
def labels(self, value):
|
||||||
|
self.cfg['labels'] = value
|
||||||
|
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for annots, brackets in annots_brackets:
|
||||||
|
@ -399,9 +507,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, token_vector_width):
|
def Model(cls, n_tags, token_vector_width):
|
||||||
return with_flatten(
|
return build_tagger_model(n_tags, token_vector_width)
|
||||||
chain(Maxout(token_vector_width, token_vector_width),
|
|
||||||
Softmax(n_tags, token_vector_width)))
|
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
|
@ -423,7 +529,7 @@ class NeuralLabeller(NeuralTagger):
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
|
|
||||||
class SimilarityHook(object):
|
class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
Experimental
|
Experimental
|
||||||
|
|
||||||
|
@ -439,9 +545,10 @@ class SimilarityHook(object):
|
||||||
Where W is a vector of dimension weights, initialized to 1.
|
Where W is a vector of dimension weights, initialized to 1.
|
||||||
"""
|
"""
|
||||||
name = 'similarity'
|
name = 'similarity'
|
||||||
def __init__(self, vocab, model=True):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, length):
|
def Model(cls, length):
|
||||||
|
@ -467,7 +574,7 @@ class SimilarityHook(object):
|
||||||
|
|
||||||
return d_tensor1s, d_tensor2s
|
return d_tensor1s, d_tensor2s
|
||||||
|
|
||||||
def begin_training(self, _, pipeline=None):
|
def begin_training(self, _=tuple(), pipeline=None):
|
||||||
"""
|
"""
|
||||||
Allocate model, using width from tensorizer in pipeline.
|
Allocate model, using width from tensorizer in pipeline.
|
||||||
|
|
||||||
|
@ -477,48 +584,77 @@ class SimilarityHook(object):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(pipeline[0].model.nO)
|
self.model = self.Model(pipeline[0].model.nO)
|
||||||
|
|
||||||
def use_params(self, params):
|
|
||||||
"""Replace weights of models in the pipeline with those provided in the
|
|
||||||
params dictionary.
|
|
||||||
|
|
||||||
params (dict): A dictionary of parameters keyed by model ID.
|
class TextCategorizer(BaseThincComponent):
|
||||||
"""
|
name = 'textcat'
|
||||||
with self.model.use_params(params):
|
|
||||||
yield
|
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
@classmethod
|
||||||
serialize = OrderedDict((
|
def Model(cls, nr_class=1, width=64, **cfg):
|
||||||
('model', lambda: self.model.to_bytes()),
|
return build_text_classifier(nr_class, width, **cfg)
|
||||||
('vocab', lambda: self.vocab.to_bytes())
|
|
||||||
))
|
|
||||||
return util.to_bytes(serialize, exclude)
|
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
self.vocab = vocab
|
||||||
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self):
|
||||||
|
return self.cfg.get('labels', ['LABEL'])
|
||||||
|
|
||||||
|
@labels.setter
|
||||||
|
def labels(self, value):
|
||||||
|
self.cfg['labels'] = value
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
scores = self.predict([doc])
|
||||||
|
self.set_annotations([doc], scores)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
for docs in cytoolz.partition_all(batch_size, stream):
|
||||||
|
docs = list(docs)
|
||||||
|
scores = self.predict(docs)
|
||||||
|
self.set_annotations(docs, scores)
|
||||||
|
yield from docs
|
||||||
|
|
||||||
|
def predict(self, docs):
|
||||||
|
scores = self.model(docs)
|
||||||
|
scores = self.model.ops.asarray(scores)
|
||||||
|
return scores
|
||||||
|
|
||||||
|
def set_annotations(self, docs, scores):
|
||||||
|
for i, doc in enumerate(docs):
|
||||||
|
for j, label in enumerate(self.labels):
|
||||||
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
|
||||||
|
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
|
||||||
|
docs, tensors = docs_tensors
|
||||||
|
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||||
|
loss, d_scores = self.get_loss(docs, golds, scores)
|
||||||
|
d_tensors = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if losses is not None:
|
||||||
|
losses.setdefault(self.name, 0.0)
|
||||||
|
losses[self.name] += loss
|
||||||
|
return d_tensors
|
||||||
|
|
||||||
|
def get_loss(self, docs, golds, scores):
|
||||||
|
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
||||||
|
for i, gold in enumerate(golds):
|
||||||
|
for j, label in enumerate(self.labels):
|
||||||
|
truths[i, j] = label in gold.cats
|
||||||
|
truths = self.model.ops.asarray(truths)
|
||||||
|
d_scores = (scores-truths) / scores.shape[0]
|
||||||
|
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
|
||||||
|
return mean_square_error, d_scores
|
||||||
|
|
||||||
|
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
||||||
|
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
|
||||||
|
token_vector_width = pipeline[0].model.nO
|
||||||
|
else:
|
||||||
|
token_vector_width = 64
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model()
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
deserialize = OrderedDict((
|
**self.cfg)
|
||||||
('model', lambda b: self.model.from_bytes(b)),
|
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
|
||||||
))
|
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
|
||||||
serialize = OrderedDict((
|
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
|
||||||
('vocab', lambda p: self.vocab.to_disk(p))
|
|
||||||
))
|
|
||||||
util.to_disk(path, serialize, exclude)
|
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model()
|
|
||||||
deserialize = OrderedDict((
|
|
||||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
|
||||||
('vocab', lambda p: self.vocab.from_disk(p))
|
|
||||||
))
|
|
||||||
util.from_disk(path, deserialize, exclude)
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
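A hedged sketch of the new `TextCategorizer` in use; the labels are invented, the component is assumed to have been trained (its model is only allocated in `begin_training()`), and the scores land in `doc.cats` as written by `set_annotations()` above:

    from spacy.pipeline import TextCategorizer

    textcat = TextCategorizer(nlp.vocab, labels=['SPORT', 'POLITICS'])  # invented labels
    nlp.pipeline.append(textcat)
    # Once the component has been trained, processing text fills doc.cats
    # with one float score per label.
    doc = nlp('An example headline.')
    for label, score in doc.cats.items():
        print(label, score)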
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
|
@ -569,6 +705,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
|
def predict_confidences(self, docs):
|
||||||
|
tensors = [d.tensor for d in docs]
|
||||||
|
samples = []
|
||||||
|
for i in range(10):
|
||||||
|
states = self.parse_batch(docs, tensors, drop=0.3)
|
||||||
|
for state in states:
|
||||||
|
samples.append(self._get_entities(state))
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
|
|
|
@ -215,7 +215,10 @@ cdef class StringStore:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with path.open('r') as file_:
|
with path.open('r') as file_:
|
||||||
strings = ujson.load(file_)
|
strings = ujson.load(file_)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
@ -234,7 +237,10 @@ cdef class StringStore:
|
||||||
RETURNS (StringStore): The `StringStore` object.
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
"""
|
"""
|
||||||
strings = ujson.loads(bytes_data)
|
strings = ujson.loads(bytes_data)
|
||||||
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
|
for word in prev:
|
||||||
|
self.add(word)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_frozen(self, bint is_frozen):
|
def set_frozen(self, bint is_frozen):
|
||||||
|
|
286
spacy/syntax/_beam_utils.pyx
Normal file
286
spacy/syntax/_beam_utils.pyx
Normal file
|
@ -0,0 +1,286 @@
|
||||||
|
# cython: infer_types=True
|
||||||
|
# cython: profile=True
|
||||||
|
cimport numpy as np
|
||||||
|
import numpy
|
||||||
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
from thinc.extra.search import MaxViolation
|
||||||
|
from thinc.typedefs cimport hash_t, class_t
|
||||||
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
from .stateclass cimport StateClass
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
|
dest = <StateClass>_dest
|
||||||
|
src = <StateClass>_src
|
||||||
|
moves = <const Transition*>_moves
|
||||||
|
dest.clone(src)
|
||||||
|
moves[clas].do(dest.c, moves[clas].label)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
|
return (<StateClass>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup(Beam beam):
|
||||||
|
for i in range(beam.width):
|
||||||
|
Py_XDECREF(<PyObject*>beam._states[i].content)
|
||||||
|
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
||||||
|
|
||||||
|
|
||||||
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
|
state = <StateClass>_state
|
||||||
|
if state.c.is_final():
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return state.c.hash()
|
||||||
|
|
||||||
|
|
||||||
|
cdef class ParserBeam(object):
|
||||||
|
cdef public TransitionSystem moves
|
||||||
|
cdef public object states
|
||||||
|
cdef public object golds
|
||||||
|
cdef public object beams
|
||||||
|
cdef public object dones
|
||||||
|
|
||||||
|
def __init__(self, TransitionSystem moves, states, golds,
|
||||||
|
int width, float density):
|
||||||
|
self.moves = moves
|
||||||
|
self.states = states
|
||||||
|
self.golds = golds
|
||||||
|
self.beams = []
|
||||||
|
cdef Beam beam
|
||||||
|
cdef StateClass state, st
|
||||||
|
for state in states:
|
||||||
|
beam = Beam(self.moves.n_moves, width, density)
|
||||||
|
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
||||||
|
for i in range(beam.width):
|
||||||
|
st = <StateClass>beam.at(i)
|
||||||
|
st.c.offset = state.c.offset
|
||||||
|
self.beams.append(beam)
|
||||||
|
self.dones = [False] * len(self.beams)
|
||||||
|
|
||||||
|
def __dealloc__(self):
|
||||||
|
if self.beams is not None:
|
||||||
|
for beam in self.beams:
|
||||||
|
if beam is not None:
|
||||||
|
_cleanup(beam)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_done(self):
|
||||||
|
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
||||||
|
|
||||||
|
def __getitem__(self, i):
|
||||||
|
return self.beams[i]
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.beams)
|
||||||
|
|
||||||
|
def advance(self, scores, follow_gold=False):
|
||||||
|
cdef Beam beam
|
||||||
|
for i, beam in enumerate(self.beams):
|
||||||
|
if beam.is_done or not scores[i].size or self.dones[i]:
|
||||||
|
continue
|
||||||
|
self._set_scores(beam, scores[i])
|
||||||
|
if self.golds is not None:
|
||||||
|
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
|
||||||
|
if follow_gold:
|
||||||
|
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
||||||
|
else:
|
||||||
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
if beam.is_done and self.golds is not None:
|
||||||
|
for j in range(beam.size):
|
||||||
|
state = <StateClass>beam.at(j)
|
||||||
|
if state.is_final():
|
||||||
|
try:
|
||||||
|
if self.moves.is_gold_parse(state, self.golds[i]):
|
||||||
|
beam._states[j].loss = 0.0
|
||||||
|
elif beam._states[j].loss == 0.0:
|
||||||
|
beam._states[j].loss = 1.0
|
||||||
|
except NotImplementedError:
|
||||||
|
break
|
||||||
|
|
||||||
|
def _set_scores(self, Beam beam, float[:, ::1] scores):
|
||||||
|
cdef float* c_scores = &scores[0, 0]
|
||||||
|
cdef int nr_state = min(scores.shape[0], beam.size)
|
||||||
|
cdef int nr_class = scores.shape[1]
|
||||||
|
for i in range(nr_state):
|
||||||
|
state = <StateClass>beam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
for j in range(nr_class):
|
||||||
|
beam.scores[i][j] = c_scores[i * nr_class + j]
|
||||||
|
self.moves.set_valid(beam.is_valid[i], state.c)
|
||||||
|
else:
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
beam.scores[i][j] = 0
|
||||||
|
beam.costs[i][j] = 0
|
||||||
|
|
||||||
|
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
|
||||||
|
for i in range(beam.size):
|
||||||
|
state = <StateClass>beam.at(i)
|
||||||
|
if not state.c.is_final():
|
||||||
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
||||||
|
if follow_gold:
|
||||||
|
for j in range(beam.nr_class):
|
||||||
|
if beam.costs[i][j] >= 1:
|
||||||
|
beam.is_valid[i][j] = 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_token_ids(states, int n_tokens):
|
||||||
|
cdef StateClass state
|
||||||
|
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
|
||||||
|
dtype='int32', order='C')
|
||||||
|
c_ids = <int*>ids.data
|
||||||
|
for i, state in enumerate(states):
|
||||||
|
if not state.is_final():
|
||||||
|
state.c.set_context_tokens(c_ids, n_tokens)
|
||||||
|
else:
|
||||||
|
ids[i] = -1
|
||||||
|
c_ids += ids.shape[1]
|
||||||
|
return ids
|
||||||
|
|
||||||
|
nr_update = 0
|
||||||
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
int width, float density,
|
||||||
|
sgd=None, losses=None, drop=0.):
|
||||||
|
global nr_update
|
||||||
|
cdef MaxViolation violn
|
||||||
|
nr_update += 1
|
||||||
|
pbeam = ParserBeam(moves, states, golds,
|
||||||
|
width=width, density=density)
|
||||||
|
gbeam = ParserBeam(moves, states, golds,
|
||||||
|
width=width, density=0.0)
|
||||||
|
cdef StateClass state
|
||||||
|
beam_maps = []
|
||||||
|
backprops = []
|
||||||
|
violns = [MaxViolation() for _ in range(len(states))]
|
||||||
|
for t in range(max_steps):
|
||||||
|
if pbeam.is_done and gbeam.is_done:
|
||||||
|
break
|
||||||
|
# The beam maps let us find the right row in the flattened scores
|
||||||
|
# arrays for each state. States are identified by (example id, history).
|
||||||
|
# We keep a different beam map for each step (since we'll have a flat
|
||||||
|
# scores array for each step). The beam map will let us take the per-state
|
||||||
|
# losses, and compute the gradient for each (step, state, class).
|
||||||
|
beam_maps.append({})
|
||||||
|
# Gather all states from the two beams in a list. Some states may occur
|
||||||
|
# in both beams. To figure out which beam each state belonged to,
|
||||||
|
# we keep two lists of indices, p_indices and g_indices
|
||||||
|
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
||||||
|
if not states:
|
||||||
|
break
|
||||||
|
# Now that we have our flat list of states, feed them through the model
|
||||||
|
token_ids = get_token_ids(states, nr_feature)
|
||||||
|
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||||
|
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||||
|
|
||||||
|
# Store the callbacks for the backward pass
|
||||||
|
backprops.append((token_ids, bp_vectors, bp_scores))
|
||||||
|
|
||||||
|
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||||
|
# tell us which example and state the scores-row refers to.
|
||||||
|
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
||||||
|
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
||||||
|
# Now advance the states in the beams. The gold beam is constrained
|
||||||
|
# to follow only gold analyses.
|
||||||
|
pbeam.advance(p_scores)
|
||||||
|
gbeam.advance(g_scores, follow_gold=True)
|
||||||
|
# Track the "maximum violation", to use in the update.
|
||||||
|
for i, violn in enumerate(violns):
|
||||||
|
violn.check_crf(pbeam[i], gbeam[i])
|
||||||
|
histories = []
|
||||||
|
losses = []
|
||||||
|
for violn in violns:
|
||||||
|
if violn.p_hist:
|
||||||
|
histories.append(violn.p_hist + violn.g_hist)
|
||||||
|
losses.append(violn.p_probs + violn.g_probs)
|
||||||
|
else:
|
||||||
|
histories.append([])
|
||||||
|
losses.append([])
|
||||||
|
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
|
||||||
|
return states_d_scores, backprops[:len(states_d_scores)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
|
seen = {}
|
||||||
|
states = []
|
||||||
|
p_indices = []
|
||||||
|
g_indices = []
|
||||||
|
cdef Beam pbeam, gbeam
|
||||||
|
assert len(pbeams) == len(gbeams)
|
||||||
|
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
|
||||||
|
p_indices.append([])
|
||||||
|
g_indices.append([])
|
||||||
|
for i in range(pbeam.size):
|
||||||
|
state = <StateClass>pbeam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
key = tuple([eg_id] + pbeam.histories[i])
|
||||||
|
assert key not in seen, (key, seen)
|
||||||
|
seen[key] = len(states)
|
||||||
|
p_indices[-1].append(len(states))
|
||||||
|
states.append(state)
|
||||||
|
beam_map.update(seen)
|
||||||
|
for i in range(gbeam.size):
|
||||||
|
state = <StateClass>gbeam.at(i)
|
||||||
|
if not state.is_final():
|
||||||
|
key = tuple([eg_id] + gbeam.histories[i])
|
||||||
|
if key in seen:
|
||||||
|
g_indices[-1].append(seen[key])
|
||||||
|
else:
|
||||||
|
g_indices[-1].append(len(states))
|
||||||
|
beam_map[key] = len(states)
|
||||||
|
states.append(state)
|
||||||
|
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
|
||||||
|
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
|
||||||
|
return states, p_idx, g_idx
|
||||||
|
|
||||||
|
|
||||||
|
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
|
"""
|
||||||
|
The global model assigns a loss to each parse. The beam scores
|
||||||
|
are additive, so the same gradient is applied to each action
|
||||||
|
in the history. This gives the gradient of a single *action*
|
||||||
|
for a beam state -- so we have "the gradient of loss for taking
|
||||||
|
action i given history H."
|
||||||
|
|
||||||
|
Histories: Each history is a list of actions
|
||||||
|
Each candidate has a history
|
||||||
|
Each beam has multiple candidates
|
||||||
|
Each batch has multiple beams
|
||||||
|
So history is list of lists of lists of ints
|
||||||
|
"""
|
||||||
|
nr_step = len(beam_maps)
|
||||||
|
grads = []
|
||||||
|
nr_step = 0
|
||||||
|
for eg_id, hists in enumerate(histories):
|
||||||
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
if loss != 0.0 and not numpy.isnan(loss):
|
||||||
|
nr_step = max(nr_step, len(hist))
|
||||||
|
for i in range(nr_step):
|
||||||
|
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
||||||
|
assert len(histories) == len(losses)
|
||||||
|
for eg_id, hists in enumerate(histories):
|
||||||
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
if loss == 0.0 or numpy.isnan(loss):
|
||||||
|
continue
|
||||||
|
key = tuple([eg_id])
|
||||||
|
# Adjust loss for length
|
||||||
|
avg_loss = loss / len(hist)
|
||||||
|
loss += avg_loss * (nr_step - len(hist))
|
||||||
|
for j, clas in enumerate(hist):
|
||||||
|
i = beam_maps[j][key]
|
||||||
|
# In step j, at state i action clas
|
||||||
|
# resulted in loss
|
||||||
|
grads[j][i, clas] += loss
|
||||||
|
key = key + tuple([clas])
|
||||||
|
return grads
|
||||||
|
|
||||||
|
|
|
@ -37,6 +37,7 @@ cdef cppclass StateC:
|
||||||
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
||||||
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
|
||||||
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
|
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
|
||||||
|
this.offset = 0
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(length + (PADDING * 2)):
|
for i in range(length + (PADDING * 2)):
|
||||||
this._ents[i].end = -1
|
this._ents[i].end = -1
|
||||||
|
@ -73,7 +74,16 @@ cdef cppclass StateC:
|
||||||
free(this.shifted - PADDING)
|
free(this.shifted - PADDING)
|
||||||
|
|
||||||
void set_context_tokens(int* ids, int n) nogil:
|
void set_context_tokens(int* ids, int n) nogil:
|
||||||
if n == 13:
|
if n == 8:
|
||||||
|
ids[0] = this.B(0)
|
||||||
|
ids[1] = this.B(1)
|
||||||
|
ids[2] = this.S(0)
|
||||||
|
ids[3] = this.S(1)
|
||||||
|
ids[4] = this.H(this.S(0))
|
||||||
|
ids[5] = this.L(this.B(0), 1)
|
||||||
|
ids[6] = this.L(this.S(0), 2)
|
||||||
|
ids[7] = this.R(this.S(0), 1)
|
||||||
|
elif n == 13:
|
||||||
ids[0] = this.B(0)
|
ids[0] = this.B(0)
|
||||||
ids[1] = this.B(1)
|
ids[1] = this.B(1)
|
||||||
ids[2] = this.S(0)
|
ids[2] = this.S(0)
|
||||||
|
|
|
@ -10,6 +10,8 @@ from libc.stdint cimport uint32_t
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
import numpy
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC, is_space_token
|
from ._state cimport StateC, is_space_token
|
||||||
|
@ -18,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
|
@ -284,7 +286,7 @@ cdef class Break:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
|
while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
|
||||||
word = gold.heads[word]
|
word = gold.heads[word]
|
||||||
if not gold.has_dep[word]:
|
if not gold.has_dep[word]:
|
||||||
return -1
|
return -1
|
||||||
|
@ -349,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
        def __get__(self):
            return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

    def is_gold_parse(self, StateClass state, GoldParse gold):
        predicted = set()
        truth = set()
        for i in range(gold.length):
            if gold.cand_to_gold[i] is None:
                continue
            if state.safe_get(i).dep:
                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
            truth.add((id_, head, dep))
        return truth == predicted

def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.heads)
|
end = end or len(gold.heads)
|
||||||
if all([tag is None for tag in gold.heads[start:end]]):
|
if all([tag is None for tag in gold.heads[start:end]]):
|
||||||
|
@ -360,7 +376,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if not self.has_gold(gold):
|
if not self.has_gold(gold):
|
||||||
return None
|
return None
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
if gold.heads[i] is None: # Missing values
|
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
||||||
gold.c.heads[i] = i
|
gold.c.heads[i] = i
|
||||||
gold.c.has_dep[i] = False
|
gold.c.has_dep[i] = False
|
||||||
else:
|
else:
|
||||||
|
@ -383,6 +399,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
return Transition(clas=0, move=MISSING, label=0)
|
||||||
|
|
||||||
def move_name(self, int move, attr_t label):
|
def move_name(self, int move, attr_t label):
|
||||||
label_str = self.strings[label]
|
label_str = self.strings[label]
|
||||||
|
@ -499,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
|
||||||
"before training and after parsing. Either pass make_projective=True "
|
"before training and after parsing. Either pass make_projective=True "
|
||||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
||||||
else:
|
else:
|
||||||
|
print(gold.orig_annot)
|
||||||
print(gold.words)
|
print(gold.words)
|
||||||
print(gold.heads)
|
print(gold.heads)
|
||||||
print(gold.labels)
|
print(gold.labels)
|
||||||
|
print(gold.sent_starts)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the dependency "
|
||||||
"parser.\n"
|
"parser.\n"
|
||||||
|
@ -510,3 +529,23 @@ cdef class ArcEager(TransitionSystem):
|
||||||
"State at failure:\n"
|
"State at failure:\n"
|
||||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
||||||
assert n_gold >= 1
|
assert n_gold >= 1
|
||||||
|
|
||||||
|
def get_beam_annot(self, Beam beam):
|
||||||
|
length = (<StateClass>beam.at(0)).c.length
|
||||||
|
heads = [{} for _ in range(length)]
|
||||||
|
deps = [{} for _ in range(length)]
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
if stcls.is_final():
|
||||||
|
prob = probs[i]
|
||||||
|
for j in range(stcls.c.length):
|
||||||
|
head = j + stcls.c._sent[j].head
|
||||||
|
dep = stcls.c._sent[j].dep
|
||||||
|
heads[j].setdefault(head, 0.0)
|
||||||
|
heads[j][head] += prob
|
||||||
|
deps[j].setdefault(dep, 0.0)
|
||||||
|
deps[j][dep] += prob
|
||||||
|
return heads, deps
|
||||||
|
|
||||||
|
|
|
@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
|
||||||
# The non-monotonic oracle makes it difficult to ensure final costs are
|
# The non-monotonic oracle makes it difficult to ensure final costs are
|
||||||
# correct. Therefore do final correction
|
# correct. Therefore do final correction
|
||||||
for i in range(pred.size):
|
for i in range(pred.size):
|
||||||
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
|
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
|
||||||
pred._states[i].loss = 0.0
|
pred._states[i].loss = 0.0
|
||||||
elif pred._states[i].loss == 0.0:
|
elif pred._states[i].loss == 0.0:
|
||||||
pred._states[i].loss = 1.0
|
pred._states[i].loss = 1.0
|
||||||
|
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
if not pred._states[i].is_done or pred._states[i].loss == 0:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>pred.at(i)
|
state = <StateClass>pred.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == True:
|
if moves.is_gold_parse(state, gold_parse) == True:
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
print("Cost", pred._states[i].loss)
|
print("Cost", pred._states[i].loss)
|
||||||
|
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
if not gold._states[i].is_done:
|
if not gold._states[i].is_done:
|
||||||
continue
|
continue
|
||||||
state = <StateClass>gold.at(i)
|
state = <StateClass>gold.at(i)
|
||||||
if is_gold(state, gold_parse, moves.strings) == False:
|
if moves.is_gold(state, gold_parse) == False:
|
||||||
print("Truth")
|
print("Truth")
|
||||||
for dep in gold_parse.orig_annot:
|
for dep in gold_parse.orig_annot:
|
||||||
print(dep[1], dep[3], dep[4])
|
print(dep[1], dep[3], dep[4])
|
||||||
|
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
|
||||||
raise Exception("Gold parse is not gold-standard")
|
raise Exception("Gold parse is not gold-standard")
|
||||||
|
|
||||||
|
|
||||||
def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
|
||||||
predicted = set()
|
|
||||||
truth = set()
|
|
||||||
for i in range(gold.length):
|
|
||||||
if gold.cand_to_gold[i] is None:
|
|
||||||
continue
|
|
||||||
if state.safe_get(i).dep:
|
|
||||||
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
|
|
||||||
else:
|
|
||||||
predicted.add((i, state.H(i), 'ROOT'))
|
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
|
||||||
truth.add((id_, head, dep))
|
|
||||||
return truth == predicted
|
|
||||||
|
|
|
@ -110,5 +110,35 @@ def es_noun_chunks(obj):
|
||||||
token = next_token(token)
|
token = next_token(token)
|
||||||
|
|
||||||
|
|
||||||
|
def french_noun_chunks(obj):
|
||||||
|
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||||
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||||
|
conj = doc.vocab.strings.add('conj')
|
||||||
|
np_label = doc.vocab.strings.add('NP')
|
||||||
|
seen = set()
|
||||||
|
for i, word in enumerate(obj):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
# Prevent nested chunks from being produced
|
||||||
|
if word.i in seen:
|
||||||
|
continue
|
||||||
|
if word.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
elif word.dep == conj:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||||
|
if head.dep in np_deps:
|
||||||
|
if any(w.i in seen for w in word.subtree):
|
||||||
|
continue
|
||||||
|
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||||
|
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||||
|
|
||||||
|
|
||||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||||
'es': es_noun_chunks}
|
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
||||||
|
|
|
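
A rough usage sketch for the new French chunker: once 'fr' is registered in CHUNKERS, iterating doc.noun_chunks on a French Doc is served by french_noun_chunks() above. The model name below is the French model referenced elsewhere in this changeset; treat the snippet as illustrative rather than a supported recipe.

import spacy

nlp = spacy.load('fr_depvec_web_lg')
doc = nlp(u"Arsene Wenger organise une conference de presse a Londres.")
for chunk in doc.noun_chunks:
    # Each chunk is a Span labelled 'NP', yielded by french_noun_chunks().
    print(chunk.text, chunk.root.dep_)
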
@ -2,7 +2,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
import numpy
|
||||||
|
from thinc.neural.ops import NumpyOps
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
@ -110,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||||
end = end or len(gold.ner)
|
end = end or len(gold.ner)
|
||||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@ -122,11 +125,46 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
|
def get_beam_annot(self, Beam beam):
|
||||||
|
entities = {}
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if stcls.is_final():
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
prob = probs[i]
|
||||||
|
for j in range(stcls.c._e_i):
|
||||||
|
start = stcls.c._ents[j].start
|
||||||
|
end = stcls.c._ents[j].end
|
||||||
|
label = stcls.c._ents[j].label
|
||||||
|
entities.setdefault((start, end, label), 0.0)
|
||||||
|
entities[(start, end, label)] += prob
|
||||||
|
return entities
|
||||||
|
|
||||||
|
def get_beam_parses(self, Beam beam):
|
||||||
|
parses = []
|
||||||
|
probs = beam.probs
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if stcls.is_final():
|
||||||
|
self.finalize_state(stcls.c)
|
||||||
|
prob = probs[i]
|
||||||
|
parse = []
|
||||||
|
for j in range(stcls.c._e_i):
|
||||||
|
start = stcls.c._ents[j].start
|
||||||
|
end = stcls.c._ents[j].end
|
||||||
|
label = stcls.c._ents[j].label
|
||||||
|
parse.append((start, end, self.strings[label]))
|
||||||
|
parses.append((prob, parse))
|
||||||
|
return parses
|
||||||
|
|
||||||
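
get_beam_parses() above returns a list of (probability, [(start, end, label), ...]) pairs, one per finished beam state. A small, hypothetical helper for turning that into per-entity confidences (plain Python, not part of this changeset):

from collections import defaultdict

def entity_confidences(parses):
    scores = defaultdict(float)
    total = 0.0
    for prob, entities in parses:
        total += prob
        for ent in entities:
            scores[ent] += prob
    # Normalise by the probability mass of the finished states seen.
    return {ent: p / total for ent, p in scores.items()} if total else {}
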
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name == None:
|
||||||
move_str = 'M'
|
move_str = 'M'
|
||||||
label = 0
|
label = 0
|
||||||
|
elif name == '!O':
|
||||||
|
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||||
elif '-' in name:
|
elif '-' in name:
|
||||||
move_str, label_str = name.split('-', 1)
|
move_str, label_str = name.split('-', 1)
|
||||||
# Hacky way to denote 'not this entity'
|
# Hacky way to denote 'not this entity'
|
||||||
|
@ -308,6 +346,9 @@ cdef class In:
|
||||||
elif g_act == UNIT:
|
elif g_act == UNIT:
|
||||||
# I, Gold U --> True iff next tag == O
|
# I, Gold U --> True iff next tag == O
|
||||||
return next_act != OUT
|
return next_act != OUT
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return 0
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
@ -350,6 +391,9 @@ cdef class Last:
|
||||||
elif g_act == UNIT:
|
elif g_act == UNIT:
|
||||||
# L, Gold U --> True
|
# L, Gold U --> True
|
||||||
return 0
|
return 0
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return 0
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
@ -418,7 +462,9 @@ cdef class Out:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING or g_act == ISNT:
|
if g_act == ISNT and g_tag == 0:
|
||||||
|
return 1
|
||||||
|
elif g_act == MISSING or g_act == ISNT:
|
||||||
return 0
|
return 0
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# O, Gold B --> False
|
# O, Gold B --> False
|
||||||
|
|
|
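
A rough reading of the cost changes above and of the lookup_transition hunk: BILUO tags may now carry a '!' to express partial supervision ("known NOT to be this"), which maps to the new ISNT move; '!O' means the token is known not to be outside an entity, so the Out action is penalised. The snippet below only restates that convention and is illustrative:

gold_ner = ['U-PERSON', 'O', '!O', 'B-!GPE', 'L-!GPE']
for tag in gold_ner:
    if tag == '!O':
        print(tag, '-> the O (outside) action gets cost 1 here')
    elif '-!' in tag:
        label = tag.split('-', 1)[1].lstrip('!')
        print(tag, '-> anything is acceptable except a %s entity' % label)
    else:
        print(tag, '-> ordinary fully supervised BILUO tag')
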
@ -29,21 +29,26 @@ from thinc.linear.avgtron cimport AveragedPerceptron
|
||||||
from thinc.linalg cimport VecVec
|
from thinc.linalg cimport VecVec
|
||||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||||
from thinc.extra.eg cimport Example
|
from thinc.extra.eg cimport Example
|
||||||
|
from thinc.extra.search cimport Beam
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
from cymem.cymem cimport Pool, Address
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport MapStruct
|
from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.api import layerize, chain, noop, clone
|
from thinc.api import layerize, chain, noop, clone, with_flatten
|
||||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||||
|
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||||
|
from thinc.neural._classes.selu import SELU
|
||||||
|
from thinc.neural._classes.layernorm import LayerNorm
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_async, get_cuda_stream
|
from ..util import get_async, get_cuda_stream
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||||
|
from .._ml import Residual, drop_layer
|
||||||
from ..compat import json_dumps
|
from ..compat import json_dumps
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
|
@ -58,8 +63,10 @@ from ..structs cimport TokenC
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..attrs cimport TAG, DEP
|
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||||
|
from . import _beam_utils
|
||||||
|
|
||||||
|
USE_FINE_TUNE = True
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
|
def get_templates(*args, **kwargs):
|
||||||
return []
|
return []
|
||||||
|
@ -110,7 +117,6 @@ cdef class precompute_hiddens:
|
||||||
self.nO = cached.shape[2]
|
self.nO = cached.shape[2]
|
||||||
self.nP = getattr(lower_model, 'nP', 1)
|
self.nP = getattr(lower_model, 'nP', 1)
|
||||||
self.ops = lower_model.ops
|
self.ops = lower_model.ops
|
||||||
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
|
|
||||||
self._is_synchronized = False
|
self._is_synchronized = False
|
||||||
self._cuda_stream = cuda_stream
|
self._cuda_stream = cuda_stream
|
||||||
self._cached = cached
|
self._cached = cached
|
||||||
|
@ -127,13 +133,12 @@ cdef class precompute_hiddens:
|
||||||
return self.begin_update(X)[0]
|
return self.begin_update(X)[0]
|
||||||
|
|
||||||
def begin_update(self, token_ids, drop=0.):
|
def begin_update(self, token_ids, drop=0.):
|
||||||
self._features.fill(0)
|
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||||
# This is tricky, but (assuming GPU available);
|
# This is tricky, but (assuming GPU available);
|
||||||
# - Input to forward on CPU
|
# - Input to forward on CPU
|
||||||
# - Output from forward on CPU
|
# - Output from forward on CPU
|
||||||
# - Input to backward on GPU!
|
# - Input to backward on GPU!
|
||||||
# - Output from backward on GPU
|
# - Output from backward on GPU
|
||||||
cdef np.ndarray state_vector = self._features[:len(token_ids)]
|
|
||||||
bp_hiddens = self._bp_hiddens
|
bp_hiddens = self._bp_hiddens
|
||||||
|
|
||||||
feat_weights = self.get_feat_weights()
|
feat_weights = self.get_feat_weights()
|
||||||
|
@ -233,11 +238,14 @@ cdef class Parser:
|
||||||
Base class of the DependencyParser and EntityRecognizer.
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||||
depth = util.env_opt('parser_hidden_depth', depth)
|
depth = util.env_opt('parser_hidden_depth', depth)
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||||
|
embed_size = util.env_opt('embed_size', 4000)
|
||||||
|
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||||
|
preprocess=doc2feats()))
|
||||||
if parser_maxout_pieces == 1:
|
if parser_maxout_pieces == 1:
|
||||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
|
@ -269,7 +277,7 @@ cdef class Parser:
|
||||||
'hidden_width': hidden_width,
|
'hidden_width': hidden_width,
|
||||||
'maxout_pieces': parser_maxout_pieces
|
'maxout_pieces': parser_maxout_pieces
|
||||||
}
|
}
|
||||||
return (lower, upper), cfg
|
return (tensors, lower, upper), cfg
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""
|
||||||
|
@ -295,6 +303,10 @@ cdef class Parser:
|
||||||
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||||
else:
|
else:
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
|
if 'beam_width' not in cfg:
|
||||||
|
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||||
|
if 'beam_density' not in cfg:
|
||||||
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
if 'actions' in self.cfg:
|
if 'actions' in self.cfg:
|
||||||
for action, labels in self.cfg.get('actions', {}).items():
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
|
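
The new beam_width / beam_density keys default from env_opt but can also be passed straight through **cfg when constructing the parser. A minimal sketch under assumed module paths (spacy.syntax.nn_parser, spacy.syntax.arc_eager):

from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager      # import path assumed
from spacy.syntax.nn_parser import Parser        # import path assumed

vocab = Vocab()
parser = Parser(vocab, moves=ArcEager(vocab.strings, {}), model=True,
                beam_width=8, beam_density=0.0001)
assert parser.cfg['beam_width'] == 8
assert parser.cfg['beam_density'] == 0.0001
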
@ -305,7 +317,7 @@ cdef class Parser:
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc, beam_width=None, beam_density=None):
|
||||||
"""
|
"""
|
||||||
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
||||||
|
|
||||||
|
@ -314,11 +326,26 @@ cdef class Parser:
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
|
if beam_width is None:
|
||||||
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
if beam_density is None:
|
||||||
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
|
cdef Beam beam
|
||||||
|
if beam_width == 1:
|
||||||
states = self.parse_batch([doc], [doc.tensor])
|
states = self.parse_batch([doc], [doc.tensor])
|
||||||
self.set_annotations([doc], states)
|
self.set_annotations([doc], states)
|
||||||
return doc
|
return doc
|
||||||
|
else:
|
||||||
|
beam = self.beam_parse([doc], [doc.tensor],
|
||||||
|
beam_width=beam_width, beam_density=beam_density)[0]
|
||||||
|
output = self.moves.get_beam_annot(beam)
|
||||||
|
state = <StateClass>beam.at(0)
|
||||||
|
self.set_annotations([doc], [state])
|
||||||
|
_cleanup(beam)
|
||||||
|
return output
|
||||||
|
|
||||||
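
When beam_width > 1, __call__ above returns the output of moves.get_beam_annot(beam) instead of only annotating the Doc in place; for the entity recognizer that is a dict mapping (start, end, label) to accumulated probability. This mirrors the new test_beam_parse.py added later in this commit (requires the en_core_web_sm model):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)   # e.g. {(start, end, label_id): summed probability, ...}
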
def pipe(self, docs, int batch_size=1000, int n_threads=2):
|
def pipe(self, docs, int batch_size=1000, int n_threads=2,
|
||||||
|
beam_width=None, beam_density=None):
|
||||||
"""
|
"""
|
||||||
Process a stream of documents.
|
Process a stream of documents.
|
||||||
|
|
||||||
|
@ -330,13 +357,23 @@ cdef class Parser:
|
||||||
The number of threads with which to work on the buffer in parallel.
|
The number of threads with which to work on the buffer in parallel.
|
||||||
Yields (Doc): Documents, in order.
|
Yields (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
cdef StateClass parse_state
|
if beam_width is None:
|
||||||
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
if beam_density is None:
|
||||||
|
beam_density = self.cfg.get('beam_density', 0.0)
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
queue = []
|
cdef Beam beam
|
||||||
for docs in cytoolz.partition_all(batch_size, docs):
|
for docs in cytoolz.partition_all(batch_size, docs):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
tokvecs = [d.tensor for d in docs]
|
tokvecs = [doc.tensor for doc in docs]
|
||||||
|
if beam_width == 1:
|
||||||
parse_states = self.parse_batch(docs, tokvecs)
|
parse_states = self.parse_batch(docs, tokvecs)
|
||||||
|
else:
|
||||||
|
beams = self.beam_parse(docs, tokvecs,
|
||||||
|
beam_width=beam_width, beam_density=beam_density)
|
||||||
|
parse_states = []
|
||||||
|
for beam in beams:
|
||||||
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
self.set_annotations(docs, parse_states)
|
self.set_annotations(docs, parse_states)
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -351,8 +388,13 @@ cdef class Parser:
|
||||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
if isinstance(tokvecses, np.ndarray):
|
||||||
|
tokvecses = [tokvecses]
|
||||||
|
|
||||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
# TODO: This is incorrect! Unhack when training next model
|
||||||
|
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
|
@ -404,6 +446,55 @@ cdef class Parser:
|
||||||
next_step.push_back(st)
|
next_step.push_back(st)
|
||||||
return states
|
return states
|
||||||
|
|
||||||
|
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
||||||
|
cdef Beam beam
|
||||||
|
cdef np.ndarray scores
|
||||||
|
cdef Doc doc
|
||||||
|
cdef int nr_class = self.moves.n_moves
|
||||||
|
cdef StateClass stcls, output
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||||
|
cuda_stream, 0.0)
|
||||||
|
beams = []
|
||||||
|
cdef int offset = 0
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
|
for doc in docs:
|
||||||
|
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||||
|
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||||
|
for i in range(beam.width):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
stcls.c.offset = offset
|
||||||
|
offset += len(doc)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
while not beam.is_done:
|
||||||
|
states = []
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
# This way we avoid having to score finalized states
|
||||||
|
# We do have to take care to keep indexes aligned, though
|
||||||
|
if not stcls.is_final():
|
||||||
|
states.append(stcls)
|
||||||
|
token_ids = self.get_token_ids(states)
|
||||||
|
vectors = state2vec(token_ids)
|
||||||
|
scores = vec2scores(vectors)
|
||||||
|
j = 0
|
||||||
|
c_scores = <float*>scores.data
|
||||||
|
for i in range(beam.size):
|
||||||
|
stcls = <StateClass>beam.at(i)
|
||||||
|
if not stcls.is_final():
|
||||||
|
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
||||||
|
for k in range(nr_class):
|
||||||
|
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
||||||
|
j += 1
|
||||||
|
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||||
|
beam.check_done(_check_final_state, NULL)
|
||||||
|
beams.append(beam)
|
||||||
|
return beams
|
||||||
|
|
||||||
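
beam_parse() above scores only the non-final states, writes the scores back into the thinc Beam, and lets the beam advance until every state is final. A very reduced pure-Python analogue, with hypothetical callables, just to show the shape of that loop:

def beam_search_sketch(init_states, score_actions, apply_action, is_final, width=3):
    beam = [(0.0, s) for s in init_states][:width]
    while not all(is_final(s) for _, s in beam):
        candidates = []
        for score, state in beam:
            if is_final(state):
                candidates.append((score, state))
                continue
            for action, action_score in score_actions(state):
                candidates.append((score + action_score, apply_action(state, action)))
        # Keep the best `width` partial analyses, like Beam.advance() does.
        beam = sorted(candidates, key=lambda c: c[0], reverse=True)[:width]
    return beam
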
cdef void _parse_step(self, StateC* state,
|
cdef void _parse_step(self, StateC* state,
|
||||||
const float* feat_weights,
|
const float* feat_weights,
|
||||||
int nr_class, int nr_feat, int nr_piece) nogil:
|
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||||
|
@ -427,6 +518,12 @@ cdef class Parser:
|
||||||
free(token_ids)
|
free(token_ids)
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||||
|
return self.update_beam(docs_tokvecs, golds,
|
||||||
|
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||||
|
drop=drop, sgd=sgd, losses=losses)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
docs, tokvec_lists = docs_tokvecs
|
docs, tokvec_lists = docs_tokvecs
|
||||||
|
@ -434,6 +531,9 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = get_cuda_stream()
|
||||||
|
|
||||||
|
@ -460,13 +560,14 @@ cdef class Parser:
|
||||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||||
|
|
||||||
d_scores = self.get_batch_loss(states, golds, scores)
|
d_scores = self.get_batch_loss(states, golds, scores)
|
||||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
d_scores /= len(docs)
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
|
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to CPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
get_async(cuda_stream, d_vector),
|
||||||
|
@ -483,7 +584,65 @@ cdef class Parser:
|
||||||
break
|
break
|
||||||
self._make_updates(d_tokvecs,
|
self._make_updates(d_tokvecs,
|
||||||
backprops, sgd, cuda_stream)
|
backprops, sgd, cuda_stream)
|
||||||
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
|
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||||
|
drop=0., sgd=None, losses=None):
|
||||||
|
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||||
|
return None
|
||||||
|
if not golds:
|
||||||
|
return None
|
||||||
|
if width is None:
|
||||||
|
width = self.cfg.get('beam_width', 2)
|
||||||
|
if density is None:
|
||||||
|
density = self.cfg.get('beam_density', 0.0)
|
||||||
|
if losses is not None and self.name not in losses:
|
||||||
|
losses[self.name] = 0.
|
||||||
|
docs, tokvecs = docs_tokvecs
|
||||||
|
lengths = [len(d) for d in docs]
|
||||||
|
assert min(lengths) >= 1
|
||||||
|
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||||
|
tokvecs += self.model[0].ops.flatten(my_tokvecs)
|
||||||
|
|
||||||
|
states = self.moves.init_batch(docs)
|
||||||
|
for gold in golds:
|
||||||
|
self.moves.preprocess_gold(gold)
|
||||||
|
|
||||||
|
cuda_stream = get_cuda_stream()
|
||||||
|
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||||
|
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||||
|
states, tokvecs, golds,
|
||||||
|
state2vec, vec2scores,
|
||||||
|
width, density,
|
||||||
|
sgd=sgd, drop=drop, losses=losses)
|
||||||
|
backprop_lower = []
|
||||||
|
cdef float batch_size = len(docs)
|
||||||
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
d_scores /= batch_size
|
||||||
|
if losses is not None:
|
||||||
|
losses[self.name] += (d_scores**2).sum()
|
||||||
|
ids, bp_vectors, bp_scores = backprops[i]
|
||||||
|
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||||
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
|
backprop_lower.append((
|
||||||
|
get_async(cuda_stream, ids),
|
||||||
|
get_async(cuda_stream, d_vector),
|
||||||
|
bp_vectors))
|
||||||
|
else:
|
||||||
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
|
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||||
|
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||||
|
if USE_FINE_TUNE:
|
||||||
|
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
return d_tokvecs
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -528,14 +687,10 @@ cdef class Parser:
|
||||||
xp = get_array_module(d_tokvecs)
|
xp = get_array_module(d_tokvecs)
|
||||||
for ids, d_vector, bp_vector in backprops:
|
for ids, d_vector, bp_vector in backprops:
|
||||||
d_state_features = bp_vector(d_vector, sgd=sgd)
|
d_state_features = bp_vector(d_vector, sgd=sgd)
|
||||||
active_feats = ids * (ids >= 0)
|
mask = ids >= 0
|
||||||
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
|
d_state_features *= mask.reshape(ids.shape + (1,))
|
||||||
if hasattr(xp, 'scatter_add'):
|
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
|
||||||
xp.scatter_add(d_tokvecs,
|
d_state_features)
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
else:
|
|
||||||
xp.add.at(d_tokvecs,
|
|
||||||
ids, d_state_features * active_feats)
|
|
||||||
|
|
||||||
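
The rewritten _make_updates above zeroes the gradient for padded feature slots (ids == -1) and scatters the rest back into d_tokvecs in one call. A minimal NumPy sketch of the same operation, with made-up shapes and np.add.at standing in for ops.scatter_add:

import numpy as np

d_tokvecs = np.zeros((5, 4), dtype='f')            # one row per token vector
ids = np.array([[0, 2, -1], [3, -1, -1]])          # feature token ids, -1 = padding
d_state_features = np.ones((2, 3, 4), dtype='f')   # gradient per (state, feature)

mask = ids >= 0
d_state_features *= mask[..., None]                # padded slots contribute nothing
np.add.at(d_tokvecs, ids * mask, d_state_features) # accumulate into the token rows
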
@property
|
@property
|
||||||
def move_names(self):
|
def move_names(self):
|
||||||
|
@ -546,7 +701,7 @@ cdef class Parser:
|
||||||
return names
|
return names
|
||||||
|
|
||||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||||
lower, upper = self.model
|
_, lower, upper = self.model
|
||||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||||
lower, stream, drop=dropout)
|
lower, stream, drop=dropout)
|
||||||
return state2vec, upper
|
return state2vec, upper
|
||||||
|
@ -560,6 +715,7 @@ cdef class Parser:
|
||||||
dtype='i', order='C')
|
dtype='i', order='C')
|
||||||
c_ids = <int*>ids.data
|
c_ids = <int*>ids.data
|
||||||
for i, state in enumerate(states):
|
for i, state in enumerate(states):
|
||||||
|
if not state.is_final():
|
||||||
state.c.set_context_tokens(c_ids, n_tokens)
|
state.c.set_context_tokens(c_ids, n_tokens)
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
@ -635,10 +791,12 @@ cdef class Parser:
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
serializers = {
|
serializers = {
|
||||||
'lower_model': lambda p: p.open('wb').write(
|
'tok2vec_model': lambda p: p.open('wb').write(
|
||||||
self.model[0].to_bytes()),
|
self.model[0].to_bytes()),
|
||||||
'upper_model': lambda p: p.open('wb').write(
|
'lower_model': lambda p: p.open('wb').write(
|
||||||
self.model[1].to_bytes()),
|
self.model[1].to_bytes()),
|
||||||
|
'upper_model': lambda p: p.open('wb').write(
|
||||||
|
self.model[2].to_bytes()),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||||
|
@ -659,24 +817,29 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(**self.cfg)
|
self.model, cfg = self.Model(**self.cfg)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
with (path / 'lower_model').open('rb') as file_:
|
with (path / 'tok2vec_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[0].from_bytes(bytes_data)
|
self.model[0].from_bytes(bytes_data)
|
||||||
with (path / 'upper_model').open('rb') as file_:
|
with (path / 'lower_model').open('rb') as file_:
|
||||||
bytes_data = file_.read()
|
bytes_data = file_.read()
|
||||||
self.model[1].from_bytes(bytes_data)
|
self.model[1].from_bytes(bytes_data)
|
||||||
|
with (path / 'upper_model').open('rb') as file_:
|
||||||
|
bytes_data = file_.read()
|
||||||
|
self.model[2].from_bytes(bytes_data)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('lower_model', lambda: self.model[0].to_bytes()),
|
('tok2vec_model', lambda: self.model[0].to_bytes()),
|
||||||
('upper_model', lambda: self.model[1].to_bytes()),
|
('lower_model', lambda: self.model[1].to_bytes()),
|
||||||
|
('upper_model', lambda: self.model[2].to_bytes()),
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||||
('cfg', lambda: ujson.dumps(self.cfg))
|
('cfg', lambda: ujson.dumps(self.cfg))
|
||||||
))
|
))
|
||||||
if 'model' in exclude:
|
if 'model' in exclude:
|
||||||
|
exclude['tok2vec_model'] = True
|
||||||
exclude['lower_model'] = True
|
exclude['lower_model'] = True
|
||||||
exclude['upper_model'] = True
|
exclude['upper_model'] = True
|
||||||
exclude.pop('model')
|
exclude.pop('model')
|
||||||
|
@ -687,6 +850,7 @@ cdef class Parser:
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||||
|
('tok2vec_model', lambda b: None),
|
||||||
('lower_model', lambda b: None),
|
('lower_model', lambda b: None),
|
||||||
('upper_model', lambda b: None)
|
('upper_model', lambda b: None)
|
||||||
))
|
))
|
||||||
|
@ -696,10 +860,12 @@ cdef class Parser:
|
||||||
self.model, cfg = self.Model(self.moves.n_moves)
|
self.model, cfg = self.Model(self.moves.n_moves)
|
||||||
else:
|
else:
|
||||||
cfg = {}
|
cfg = {}
|
||||||
|
if 'tok2vec_model' in msg:
|
||||||
|
self.model[0].from_bytes(msg['tok2vec_model'])
|
||||||
if 'lower_model' in msg:
|
if 'lower_model' in msg:
|
||||||
self.model[0].from_bytes(msg['lower_model'])
|
self.model[1].from_bytes(msg['lower_model'])
|
||||||
if 'upper_model' in msg:
|
if 'upper_model' in msg:
|
||||||
self.model[1].from_bytes(msg['upper_model'])
|
self.model[2].from_bytes(msg['upper_model'])
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
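
With the serialization hunks above, the parser's model is persisted as three parts under the keys tok2vec_model, lower_model and upper_model. An illustrative round trip, assuming `parser` is an initialised Parser from this module:

data = parser.to_bytes()          # includes tok2vec_model, lower_model, upper_model,
                                  # plus vocab, moves and cfg
parser = parser.from_bytes(data)  # restores all three sub-models
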
@ -762,3 +928,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
|
||||||
mode = i
|
mode = i
|
||||||
score = scores[i]
|
score = scores[i]
|
||||||
return mode
|
return mode
|
||||||
|
|
||||||
|
|
||||||
|
# These are passed as callbacks to thinc.extra.search.Beam
|
||||||
|
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||||
|
dest = <StateClass>_dest
|
||||||
|
src = <StateClass>_src
|
||||||
|
moves = <const Transition*>_moves
|
||||||
|
dest.clone(src)
|
||||||
|
moves[clas].do(dest.c, moves[clas].label)
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||||
|
return (<StateClass>_state).is_final()
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup(Beam beam):
|
||||||
|
for i in range(beam.width):
|
||||||
|
Py_XDECREF(<PyObject*>beam._states[i].content)
|
||||||
|
Py_XDECREF(<PyObject*>beam._parents[i].content)
|
||||||
|
|
||||||
|
|
||||||
|
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||||
|
state = <StateClass>_state
|
||||||
|
if state.c.is_final():
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
return state.c.hash()
|
||||||
|
|
|
@ -99,6 +99,9 @@ cdef class TransitionSystem:
|
||||||
def preprocess_gold(self, GoldParse gold):
|
def preprocess_gold(self, GoldParse gold):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def is_gold_parse(self, StateClass state, GoldParse gold):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -107,6 +110,8 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def is_valid(self, StateClass stcls, move_name):
|
def is_valid(self, StateClass stcls, move_name):
|
||||||
action = self.lookup_transition(move_name)
|
action = self.lookup_transition(move_name)
|
||||||
|
if action.move == 0:
|
||||||
|
return False
|
||||||
return action.is_valid(stcls.c, action.label)
|
return action.is_valid(stcls.c, action.label)
|
||||||
|
|
||||||
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
||||||
|
@ -137,6 +142,10 @@ cdef class TransitionSystem:
|
||||||
"the entity recognizer\n"
|
"the entity recognizer\n"
|
||||||
"The transition system has %d actions." % (self.n_moves))
|
"The transition system has %d actions." % (self.n_moves))
|
||||||
|
|
||||||
|
def get_class_name(self, int clas):
|
||||||
|
act = self.c[clas]
|
||||||
|
return self.move_name(act.move, act.label)
|
||||||
|
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int):
|
if not isinstance(label_name, int):
|
||||||
|
|
|
@ -11,9 +11,9 @@ from ..strings import StringStore
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
|
||||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
|
||||||
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
|
_models = {'en': ['en_core_web_sm'],
|
||||||
'de': ['de_core_news_md'],
|
'de': ['de_core_news_md'],
|
||||||
'fr': ['fr_depvec_web_lg'],
|
'fr': ['fr_depvec_web_lg'],
|
||||||
'xx': ['xx_ent_web_md']}
|
'xx': ['xx_ent_web_md']}
|
||||||
|
@ -86,6 +86,9 @@ def hu_tokenizer():
|
||||||
def fi_tokenizer():
|
def fi_tokenizer():
|
||||||
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
return util.get_lang_class('fi').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def id_tokenizer():
|
||||||
|
return util.get_lang_class('id').Defaults.create_tokenizer()
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sv_tokenizer():
|
def sv_tokenizer():
|
||||||
|
|
|
@ -2,12 +2,18 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from ....tokens.doc import Doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def en_lemmatizer(EN):
|
def en_lemmatizer(EN):
|
||||||
return EN.Defaults.create_lemmatizer()
|
return EN.Defaults.create_lemmatizer()
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_doc_lemmatization(EN):
|
||||||
|
doc = Doc(EN.vocab, words=['bleed'])
|
||||||
|
doc[0].tag_ = 'VBP'
|
||||||
|
assert doc[0].lemma_ == 'bleed'
|
||||||
|
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
|
||||||
|
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
assert en_lemmatizer.noun(text) == set(lemmas)
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
|
||||||
|
("feed", ["feed"]),
|
||||||
|
("need", ["need"]),
|
||||||
|
("ring", ["ring"]),
|
||||||
|
("axes", ["axis", "axe", "ax"])])
|
||||||
|
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
|
||||||
|
assert en_lemmatizer.noun(text) == set(lemmas)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
def test_en_lemmatizer_base_forms(en_lemmatizer):
|
||||||
|
|
|
@ -25,7 +25,6 @@ def test_tag_names(EN):
|
||||||
doc = EN(text, disable=['parser'])
|
doc = EN(text, disable=['parser'])
|
||||||
assert type(doc[2].pos) == int
|
assert type(doc[2].pos) == int
|
||||||
assert isinstance(doc[2].pos_, six.text_type)
|
assert isinstance(doc[2].pos_, six.text_type)
|
||||||
assert type(doc[2].dep) == int
|
|
||||||
assert isinstance(doc[2].dep_, six.text_type)
|
assert isinstance(doc[2].dep_, six.text_type)
|
||||||
assert doc[2].tag_ == u'NNS'
|
assert doc[2].tag_ == u'NNS'
|
||||||
|
|
||||||
|
|
0
spacy/tests/lang/id/__init__.py
Normal file
115
spacy/tests/lang/id/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
|
||||||
|
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_no_special(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif"])
|
||||||
|
def test_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif"])
|
||||||
|
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
||||||
|
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||||
|
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.)"])
|
||||||
|
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
|
||||||
|
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||||
|
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
|
||||||
|
def test_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
|
||||||
|
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
assert tokens[0].text == text.split(",")[0]
|
||||||
|
assert tokens[1].text == ","
|
||||||
|
assert tokens[2].text == text.split(",")[1]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
|
||||||
|
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
|
||||||
|
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
|
||||||
|
assert len(tokens) == 10
|
||||||
|
assert tokens[0].text == "Arsene"
|
||||||
|
assert tokens[1].text == "Wenger"
|
||||||
|
assert tokens[2].text == "--"
|
||||||
|
assert tokens[3].text == "manajer"
|
||||||
|
assert tokens[4].text == "Arsenal"
|
||||||
|
assert tokens[5].text == "--"
|
||||||
|
assert tokens[6].text == "melakukan"
|
||||||
|
assert tokens[7].text == "konferensi"
|
||||||
|
assert tokens[8].text == "pers"
|
||||||
|
assert tokens[9].text == "."
|
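
The tests above rely on the id_tokenizer fixture added to conftest.py earlier in this changeset. Reproducing it interactively is a short, illustrative snippet:

from spacy.util import get_lang_class

id_tokenizer = get_lang_class('id').Defaults.create_tokenizer()
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
print([t.text for t in tokens])   # '--' should come out as its own token, twice
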
10
spacy/tests/parser/test_beam_parse.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
import spacy
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.mark.models
|
||||||
|
def test_beam_parse():
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
doc = nlp(u'Australia is a country', disable=['ner'])
|
||||||
|
ents = nlp.entity(doc, beam_width=2)
|
||||||
|
print(ents)
|
||||||
|
|
73
spacy/tests/parser/test_ner.py
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.ner import BiluoPushDown
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab():
|
||||||
|
return Vocab()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(vocab):
|
||||||
|
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def entity_annots(doc):
|
||||||
|
casey = doc[0:1]
|
||||||
|
ny = doc[3:5]
|
||||||
|
return [(casey.start_char, casey.end_char, 'PERSON'),
|
||||||
|
(ny.start_char, ny.end_char, 'GPE')]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def entity_types(entity_annots):
|
||||||
|
return sorted(set([label for (s, e, label) in entity_annots]))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tsys(vocab, entity_types):
|
||||||
|
actions = BiluoPushDown.get_actions(entity_types=entity_types)
|
||||||
|
return BiluoPushDown(vocab.strings, actions)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves(tsys, doc, entity_annots):
|
||||||
|
gold = GoldParse(doc, entities=entity_annots)
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
|
||||||
|
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
|
||||||
|
gold = GoldParse(doc, entities=entity_annots)
|
||||||
|
for i, tag in enumerate(gold.ner):
|
||||||
|
if tag == 'L-!GPE':
|
||||||
|
gold.ner[i] = '-'
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
||||||
|
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||||
|
gold = GoldParse(doc, entities=[])
|
||||||
|
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||||
|
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
||||||
|
gold = GoldParse(doc, entities=[])
|
||||||
|
gold.ner = ['O', '!O', 'O', '!O']
|
||||||
|
tsys.preprocess_gold(gold)
|
||||||
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
|
names = [tsys.get_class_name(act) for act in act_classes]
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
from mock import Mock
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
@ -36,7 +35,7 @@ def parser(vocab, arc_eager):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def model(arc_eager, tok2vec):
|
def model(arc_eager, tok2vec):
|
||||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)
|
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(vocab):
|
def doc(vocab):
|
||||||
|
@ -45,29 +44,50 @@ def doc(vocab):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def gold(doc):
|
def gold(doc):
|
||||||
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
||||||
|
|
||||||
|
|
||||||
def test_can_init_nn_parser(parser):
|
def test_can_init_nn_parser(parser):
|
||||||
assert parser.model is None
|
assert parser.model is None
|
||||||
|
|
||||||
|
|
||||||
def test_build_model(parser):
|
def test_build_model(parser):
|
||||||
parser.model = Parser.Model(parser.moves.n_moves)
|
parser.model = Parser.Model(parser.moves.n_moves)[0]
|
||||||
assert parser.model is not None
|
assert parser.model is not None
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_predict_doc(parser, tok2vec, model, doc):
|
def test_predict_doc(parser, tok2vec, model, doc):
|
||||||
doc.tensor = tok2vec([doc])
|
doc.tensor = tok2vec([doc])[0]
|
||||||
parser.model = model
|
parser.model = model
|
||||||
parser(doc)
|
parser(doc)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_update_doc(parser, tok2vec, model, doc, gold):
|
def test_update_doc(parser, tok2vec, model, doc, gold):
|
||||||
parser.model = model
|
parser.model = model
|
||||||
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||||
d_tokvecs = parser.update((doc, tokvecs), gold)
|
d_tokvecs = parser.update(([doc], tokvecs), [gold])
|
||||||
assert d_tokvecs.shape == tokvecs.shape
|
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||||
def optimize(weights, gradient, key=None):
|
def optimize(weights, gradient, key=None):
|
||||||
weights -= 0.001 * gradient
|
weights -= 0.001 * gradient
|
||||||
bp_tokvecs(d_tokvecs, sgd=optimize)
|
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||||
assert d_tokvecs.sum() == 0.
|
assert d_tokvecs[0].sum() == 0.
|
||||||
|
|
||||||
|
|
||||||
|
def test_predict_doc_beam(parser, tok2vec, model, doc):
|
||||||
|
doc.tensor = tok2vec([doc])[0]
|
||||||
|
parser.model = model
|
||||||
|
parser(doc, beam_width=32, beam_density=0.001)
|
||||||
|
for word in doc:
|
||||||
|
print(word.text, word.head, word.dep_)
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
|
||||||
|
parser.model = model
|
||||||
|
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||||
|
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
|
||||||
|
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||||
|
def optimize(weights, gradient, key=None):
|
||||||
|
weights -= 0.001 * gradient
|
||||||
|
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||||
|
assert d_tokvecs[0].sum() == 0.
|
||||||
|
|
||||||
|
|
||||||
|
|
87
spacy/tests/parser/test_nn_beam.py
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
import numpy
|
||||||
|
from thinc.api import layerize
|
||||||
|
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.arc_eager import ArcEager
|
||||||
|
from ...tokens import Doc
|
||||||
|
from ...gold import GoldParse
|
||||||
|
from ...syntax._beam_utils import ParserBeam, update_beam
|
||||||
|
from ...syntax.stateclass import StateClass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vocab():
|
||||||
|
return Vocab()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def moves(vocab):
|
||||||
|
aeager = ArcEager(vocab.strings, {})
|
||||||
|
aeager.add_action(2, 'nsubj')
|
||||||
|
aeager.add_action(3, 'dobj')
|
||||||
|
aeager.add_action(2, 'aux')
|
||||||
|
return aeager
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def docs(vocab):
|
||||||
|
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def states(docs):
|
||||||
|
return [StateClass(doc) for doc in docs]
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tokvecs(docs, vector_size):
|
||||||
|
output = []
|
||||||
|
for doc in docs:
|
||||||
|
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
|
||||||
|
output.append(numpy.asarray(vec))
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def golds(docs):
|
||||||
|
return [GoldParse(doc) for doc in docs]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def batch_size(docs):
|
||||||
|
return len(docs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam_width():
|
||||||
|
return 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def vector_size():
|
||||||
|
return 6
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def beam(moves, states, golds, beam_width):
|
||||||
|
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def scores(moves, batch_size, beam_width):
|
||||||
|
return [
|
||||||
|
numpy.asarray(
|
||||||
|
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
|
||||||
|
dtype='f')
|
||||||
|
for _ in range(batch_size)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_beam(beam):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance(beam, scores):
|
||||||
|
beam.advance(scores)
|
||||||
|
|
||||||
|
|
||||||
|
def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
with pytest.raises(IndexError):
|
||||||
|
beam.advance(scores[:-1])
|
12
spacy/tests/regression/test_issue1257.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
'''Test tokens compare correctly'''
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue1257():
|
||||||
|
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
|
||||||
|
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
|
||||||
|
assert doc1[0] != doc2[0]
|
||||||
|
assert not doc1[0] == doc2[0]
|
|
@ -11,8 +11,8 @@ import pytest
|
||||||
def taggers(en_vocab):
|
def taggers(en_vocab):
|
||||||
tagger1 = Tagger(en_vocab)
|
tagger1 = Tagger(en_vocab)
|
||||||
tagger2 = Tagger(en_vocab)
|
tagger2 = Tagger(en_vocab)
|
||||||
tagger1.model = tagger1.Model(None, None)
|
tagger1.model = tagger1.Model(8, 8)
|
||||||
tagger2.model = tagger2.Model(None, None)
|
tagger2.model = tagger1.model
|
||||||
return (tagger1, tagger2)
|
return (tagger1, tagger2)
|
||||||
|
|
||||||
|
|
||||||
|
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
|
||||||
tagger1, tagger2 = taggers
|
tagger1, tagger2 = taggers
|
||||||
tagger1_b = tagger1.to_bytes()
|
tagger1_b = tagger1.to_bytes()
|
||||||
tagger2_b = tagger2.to_bytes()
|
tagger2_b = tagger2.to_bytes()
|
||||||
assert tagger1_b == tagger2_b
|
|
||||||
tagger1 = tagger1.from_bytes(tagger1_b)
|
tagger1 = tagger1.from_bytes(tagger1_b)
|
||||||
assert tagger1.to_bytes() == tagger1_b
|
assert tagger1.to_bytes() == tagger1_b
|
||||||
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
from ...attrs import ORTH, LENGTH
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
|
||||||
span3 = tokens[0:2]
|
span3 = tokens[0:2]
|
||||||
assert hash(span3) == hash(span1)
|
assert hash(span3) == hash(span1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_spans_by_character(doc):
|
||||||
|
span1 = doc[1:-2]
|
||||||
|
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
assert span2.label_ == 'GPE'
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_to_array(doc):
|
||||||
|
span = doc[1:-2]
|
||||||
|
arr = span.to_array([ORTH, LENGTH])
|
||||||
|
assert arr.shape == (len(span), 2)
|
||||||
|
assert arr[0, 0] == span[0].orth
|
||||||
|
assert arr[0, 1] == len(span[0])
|
||||||
|
|
||||||
|
|
|
@@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
     """Add list of vector tuples to given vocab. All vectors need to have the
     same length. Format: [("text", [1, 2, 3])]"""
     length = len(vectors[0][1])
-    vocab.resize_vectors(length)
+    vocab.clear_vectors(length)
     for word, vec in vectors:
-        vocab[word].vector = vec
+        vocab.set_vector(word, vec)
     return vocab
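The helper now goes through the public `Vocab` vector API instead of assigning to lexemes directly, and its docstring pins down the expected input: a list of `(text, vector)` tuples, all of the same length. A minimal sketch of how a test might use it, assuming the usual `en_vocab` fixture and the `add_vecs_to_vocab` import from `..util` (the test name is hypothetical):

def test_vectors_roundtrip(en_vocab):
    # Vector tuples in the documented format: every vector has the same length.
    vectors = [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
    vocab = add_vecs_to_vocab(en_vocab, vectors)
    assert vocab.has_vector("apple")
    assert list(vocab.get_vector("apple")) == [1, 2, 3]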
@@ -14,10 +14,9 @@ def vectors():

 @pytest.fixture()
 def vocab(en_vocab, vectors):
-    #return add_vecs_to_vocab(en_vocab, vectors)
-    return None
+    add_vecs_to_vocab(en_vocab, vectors)
+    return en_vocab

-@pytest.mark.xfail
 def test_vectors_similarity_LL(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     lex1 = vocab[word1]


@@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))


-@pytest.mark.xfail
 def test_vectors_similarity_TT(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])


@@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))


-@pytest.mark.xfail
 def test_vectors_similarity_TD(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
     assert doc.similarity(doc[0]) == doc[0].similarity(doc)


-@pytest.mark.xfail
 def test_vectors_similarity_DS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
     assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)


-@pytest.mark.xfail
 def test_vectors_similarity_TS(vocab, vectors):
     [(word1, vec1), (word2, vec2)] = vectors
     doc = get_doc(vocab, words=[word1, word2])
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals

 from ...vectors import Vectors
+from ...tokenizer import Tokenizer
+from ..util import add_vecs_to_vocab, get_doc

 import numpy
 import pytest


@@ -11,22 +13,42 @@ import pytest
 def strings():
     return ["apple", "orange"]

+@pytest.fixture
+def vectors():
+    return [
+        ("apple", [1, 2, 3]),
+        ("orange", [-1, -2, -3]),
+        ('and', [-1, -1, -1]),
+        ('juice', [5, 5, 10]),
+        ('pie', [7, 6.3, 8.9])]
+

 @pytest.fixture
 def data():
     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')


+@pytest.fixture()
+def vocab(en_vocab, vectors):
+    add_vecs_to_vocab(en_vocab, vectors)
+    return en_vocab
+

 def test_init_vectors_with_data(strings, data):
     v = Vectors(strings, data)
     assert v.shape == data.shape

 def test_init_vectors_with_width(strings):
     v = Vectors(strings, 3)
+    for string in strings:
+        v.add(string)
     assert v.shape == (len(strings), 3)


 def test_get_vector(strings, data):
     v = Vectors(strings, data)
+    for string in strings:
+        v.add(string)
     assert list(v[strings[0]]) == list(data[0])
     assert list(v[strings[0]]) != list(data[1])
     assert list(v[strings[1]]) != list(data[0])


@@ -35,6 +57,8 @@ def test_get_vector(strings, data):
 def test_set_vector(strings, data):
     orig = data.copy()
     v = Vectors(strings, data)
+    for string in strings:
+        v.add(string)
     assert list(v[strings[0]]) == list(orig[0])
     assert list(v[strings[0]]) != list(orig[1])
     v[strings[0]] = data[1]


@@ -42,125 +66,111 @@ def test_set_vector(strings, data):
     assert list(v[strings[0]]) != list(orig[0])


-#
-#@pytest.fixture()
-#def tokenizer_v(vocab):
-#    return Tokenizer(vocab, {}, None, None, None)
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple and orange"])
-#def test_vectors_token_vector(tokenizer_v, vectors, text):
-#    doc = tokenizer_v(text)
-#    assert vectors[0] == (doc[0].text, list(doc[0].vector))
-#    assert vectors[1] == (doc[2].text, list(doc[2].vector))
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple", "orange"])
-#def test_vectors_lexeme_vector(vocab, text):
-#    lex = vocab[text]
-#    assert list(lex.vector)
-#    assert lex.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-#def test_vectors_doc_vector(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert list(doc.vector)
-#    assert doc.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
-#def test_vectors_span_vector(vocab, text):
-#    span = get_doc(vocab, text)[0:2]
-#    assert list(span.vector)
-#    assert span.vector_norm
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', ["apple orange"])
-#def test_vectors_token_token_similarity(tokenizer_v, text):
-#    doc = tokenizer_v(text)
-#    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc[1]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
-#    token = tokenizer_v(text1)
-#    lex = vocab[text2]
-#    assert token.similarity(lex) == lex.similarity(token)
-#    assert 0.0 < token.similarity(lex) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_token_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_token_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0].similarity(doc) == doc.similarity(doc[0])
-#    assert 0.0 < doc[0].similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_lexeme_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    lex = vocab[text[0]]
-#    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
-#    assert 0.0 < doc.similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
-#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
-#    lex1 = vocab[text1]
-#    lex2 = vocab[text2]
-#    assert lex1.similarity(lex2) == lex2.similarity(lex1)
-#    assert 0.0 < lex1.similarity(lex2) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_lexeme_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    lex = vocab[text[0]]
-#    assert lex.similarity(doc) == doc.similarity(lex)
-#    assert 0.0 < lex.similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_span_span_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-#    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
-#def test_vectors_span_doc_similarity(vocab, text):
-#    doc = get_doc(vocab, text)
-#    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-#    assert 0.0 < doc[0:2].similarity(doc) < 1.0
-#
-#
-#@pytest.mark.xfail
-#@pytest.mark.parametrize('text1,text2', [
-#    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
-#def test_vectors_doc_doc_similarity(vocab, text1, text2):
-#    doc1 = get_doc(vocab, text1)
-#    doc2 = get_doc(vocab, text2)
-#    assert doc1.similarity(doc2) == doc2.similarity(doc1)
-#    assert 0.0 < doc1.similarity(doc2) < 1.0
+@pytest.fixture()
+def tokenizer_v(vocab):
+    return Tokenizer(vocab, {}, None, None, None)
+
+
+@pytest.mark.parametrize('text', ["apple and orange"])
+def test_vectors_token_vector(tokenizer_v, vectors, text):
+    doc = tokenizer_v(text)
+    assert vectors[0] == (doc[0].text, list(doc[0].vector))
+    assert vectors[1] == (doc[2].text, list(doc[2].vector))
+
+
+@pytest.mark.parametrize('text', ["apple", "orange"])
+def test_vectors_lexeme_vector(vocab, text):
+    lex = vocab[text]
+    assert list(lex.vector)
+    assert lex.vector_norm
+
+
+@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+def test_vectors_doc_vector(vocab, text):
+    doc = get_doc(vocab, text)
+    assert list(doc.vector)
+    assert doc.vector_norm
+
+
+@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+def test_vectors_span_vector(vocab, text):
+    span = get_doc(vocab, text)[0:2]
+    assert list(span.vector)
+    assert span.vector_norm
+
+
+@pytest.mark.parametrize('text', ["apple orange"])
+def test_vectors_token_token_similarity(tokenizer_v, text):
+    doc = tokenizer_v(text)
+    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
+    assert -1. < doc[0].similarity(doc[1]) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
+    token = tokenizer_v(text1)
+    lex = vocab[text2]
+    assert token.similarity(lex) == lex.similarity(token)
+    assert -1. < token.similarity(lex) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_token_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
+    assert -1. < doc[0].similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_token_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0].similarity(doc) == doc.similarity(doc[0])
+    assert -1. < doc[0].similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_lexeme_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    lex = vocab[text[0]]
+    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
+    assert -1. < doc.similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
+    lex1 = vocab[text1]
+    lex2 = vocab[text2]
+    assert lex1.similarity(lex2) == lex2.similarity(lex1)
+    assert -1. < lex1.similarity(lex2) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_lexeme_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    lex = vocab[text[0]]
+    assert lex.similarity(doc) == doc.similarity(lex)
+    assert -1. < lex.similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_span_span_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
+    assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
+
+
+@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+def test_vectors_span_doc_similarity(vocab, text):
+    doc = get_doc(vocab, text)
+    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
+    assert -1. < doc[0:2].similarity(doc) < 1.0
+
+
+@pytest.mark.parametrize('text1,text2', [
+    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
+def test_vectors_doc_doc_similarity(vocab, text1, text2):
+    doc1 = get_doc(vocab, text1)
+    doc2 = get_doc(vocab, text2)
+    assert doc1.similarity(doc2) == doc2.similarity(doc1)
+    assert -1. < doc1.similarity(doc2) < 1.0
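The rewritten fixtures and tests above pin down the new `Vectors` lifecycle: the table can be constructed either from an existing array or from a width, and a key only maps to a row once it has been `add()`ed. A condensed sketch of that lifecycle, reusing the same strings and data as the fixtures:

import numpy
from spacy.vectors import Vectors

strings = ["apple", "orange"]
data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')

# Construct from an existing table; keys are bound to rows via add().
v = Vectors(strings, data)
for string in strings:
    v.add(string)
assert v.shape == data.shape
assert list(v[strings[0]]) == list(data[0])

# Or construct from a width and fill the rows in later.
v = Vectors(strings, 3)
for string in strings:
    v.add(string)
assert v.shape == (len(strings), 3)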
@@ -33,6 +33,7 @@ cdef class Doc:
     cdef public object _vector_norm

     cdef public object tensor
+    cdef public object cats
     cdef public object user_data

     cdef TokenC* c


@@ -117,6 +117,7 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self.sentiment = 0.0
+        self.cats = {}
         self.user_hooks = {}
         self.user_token_hooks = {}
         self.user_span_hooks = {}


@@ -237,6 +238,29 @@ cdef class Doc:
     def doc(self):
         return self

+    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        if not isinstance(label, int):
+            label = self.vocab.strings.add(label)
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.


@@ -279,8 +303,14 @@ cdef class Doc:
             return self.user_hooks['vector'](self)
         if self._vector is not None:
             return self._vector
-        elif self.has_vector and len(self):
-            self._vector = sum(t.vector for t in self) / len(self)
+        elif not len(self):
+            self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+            return self._vector
+        elif self.has_vector:
+            vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+            for token in self.c[:self.length]:
+                vector += self.vocab.get_vector(token.lex.orth)
+            self._vector = vector / len(self)
             return self._vector
         elif self.tensor is not None:
            self._vector = self.tensor.mean(axis=0)
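The new `Doc.char_span` method maps raw character offsets back onto token boundaries and returns `None` when the offsets do not line up with tokens. A small usage sketch, assuming the v2-style blank English pipeline import path (`spacy.lang.en`) is available:

from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc = nlp(u'I like New York in Autumn.')

start_char = doc.text.index(u'New York')
end_char = start_char + len(u'New York')

span = doc.char_span(start_char, end_char, label=u'GPE')
assert span is not None             # None would mean the offsets cut through a token
assert span.text == u'New York'
assert span.label_ == u'GPE'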
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm


     cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)


@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt

-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t


@@ -135,6 +135,29 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
+        The values will be 32-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        cdef int length = self.end - self.start
+        output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \
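`Span.to_array` mirrors the existing `Doc.to_array`: given M attribute IDs it returns an `(N, M)` matrix with one row per token in the span. A usage sketch along the lines of the `test_span_to_array` test added earlier, again assuming the blank-English import path:

from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc = nlp(u'apple and orange juice')
span = doc[1:3]

arr = span.to_array([ORTH, LENGTH])   # one row per token, one column per attribute
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth      # hash of the token's orthographic form
assert arr[0, 1] == len(span[0])      # character length of the token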
@@ -62,18 +62,26 @@ cdef class Token:

     def __richcmp__(self, Token other, int op):
         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
+        cdef Doc my_doc = self.doc
+        cdef Doc other_doc = other.doc
         my = self.idx
         their = other.idx if other is not None else None
         if op == 0:
             return my < their
         elif op == 2:
-            return my == their
+            if my_doc is other_doc:
+                return my == their
+            else:
+                return False
         elif op == 4:
             return my > their
         elif op == 1:
             return my <= their
         elif op == 3:
-            return my != their
+            if my_doc is other_doc:
+                return my != their
+            else:
+                return True
         elif op == 5:
             return my >= their
         else:
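With this guard, `==` and `!=` only compare offsets for tokens that belong to the same `Doc`; tokens from different docs now compare unequal even when their offsets match, which is exactly what the `test_issue1257` regression test asserts. A quick illustration under the same assumed blank-English setup as the earlier sketches:

from spacy.lang.en import English   # assumed import path for a blank v2 pipeline

nlp = English()
doc1 = nlp(u'a b c')
doc2 = nlp(u'a c e')

assert doc1[0] == doc1[0]           # same Doc, same offset
assert doc1[0] != doc2[0]           # same offset, different Doc
assert not (doc1[0] == doc2[0])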
@@ -22,7 +22,7 @@ import ujson

 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
-from .compat import copy_array, normalize_string_keys, getattr_
+from .compat import copy_array, normalize_string_keys, getattr_, import_file


 LANGUAGES = {}


@@ -112,15 +112,13 @@ def load_model(name, **overrides):

 def load_model_from_link(name, **overrides):
     """Load a model from a shortcut link, or directory in spaCy data path."""
-    init_file = get_data_path() / name / '__init__.py'
-    spec = importlib.util.spec_from_file_location(name, init_file)
+    path = get_data_path() / name / '__init__.py'
     try:
-        cls = importlib.util.module_from_spec(spec)
+        cls = import_file(name, path)
     except AttributeError:
         raise IOError(
             "Cant' load '%s'. If you're using a shortcut link, make sure it "
             "points to a valid model package (not just a data directory)." % name)
-    spec.loader.exec_module(cls)
     return cls.load(**overrides)


@@ -171,8 +169,8 @@ def get_model_meta(path):
         raise IOError("Could not read meta.json from %s" % meta_path)
     meta = read_json(meta_path)
     for setting in ['lang', 'name', 'version']:
-        if setting not in meta:
-            raise ValueError('No %s setting found in model meta.json' % setting)
+        if setting not in meta or not meta[setting]:
+            raise ValueError("No valid '%s' setting found in model meta.json" % setting)
     return meta
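The tightened check means a model package's meta.json must not only contain the `lang`, `name` and `version` keys but also give them non-empty values. A stand-alone sketch of that validation, with illustrative values:

# Minimal meta.json contents that pass the stricter check (values are examples).
meta = {
    'lang': 'en',
    'name': 'core_web_sm',
    'version': '2.0.0',
}

for setting in ['lang', 'name', 'version']:
    if setting not in meta or not meta[setting]:
        raise ValueError("No valid '%s' setting found in model meta.json" % setting)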
@@ -1,18 +1,25 @@
+from __future__ import unicode_literals
+from libc.stdint cimport int32_t, uint64_t
 import numpy
 from collections import OrderedDict
 import msgpack
 import msgpack_numpy
 msgpack_numpy.patch()
+cimport numpy as np

+from .typedefs cimport attr_t
 from .strings cimport StringStore
 from . import util
+from .compat import basestring_


 cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object key2i
+    cdef public object key2row
+    cdef public object keys
+    cdef public int i

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()


@@ -21,10 +28,10 @@ cdef class Vectors:
                                 dtype='f')
         else:
             data = data_or_width
+        self.i = 0
         self.data = data
-        self.key2i = {}
-        for i, string in enumerate(strings):
-            self.key2i[self.strings.add(string)] = i
+        self.key2row = {}
+        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))


@@ -32,7 +39,7 @@
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.key2i[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:


@@ -41,14 +48,36 @@
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.key2i[key]
+        i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):
         yield from self.data

     def __len__(self):
-        return len(self.strings)
+        return self.i
+
+    def __contains__(self, key):
+        if isinstance(key, basestring_):
+            key = self.strings[key]
+        return key in self.key2row
+
+    def add(self, key, vector=None):
+        if isinstance(key, basestring_):
+            key = self.strings.add(key)
+        if key not in self.key2row:
+            i = self.i
+            if i >= self.keys.shape[0]:
+                self.keys.resize((self.keys.shape[0]*2,))
+                self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
+            self.key2row[key] = self.i
+            self.keys[self.i] = key
+            self.i += 1
+        else:
+            i = self.key2row[key]
+        if vector is not None:
+            self.data[i] = vector
+        return i

     def items(self):
         for i, string in enumerate(self.strings):


@@ -61,34 +90,87 @@ cdef class Vectors:
     def most_similar(self, key):
         raise NotImplementedError

-    def to_disk(self, path):
-        raise NotImplementedError
+    def from_glove(self, path):
+        '''Load GloVe vectors from a directory. Assumes binary format,
+        that the vocab is in a vocab.txt, and that vectors are named
+        vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
+        vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
+        By default GloVe outputs 64-bit vectors.'''
+        path = util.ensure_path(path)
+        for name in path.iterdir():
+            if name.parts[-1].startswith('vectors'):
+                _, dims, dtype, _2 = name.parts[-1].split('.')
+                self.width = int(dims)
+                break
+        else:
+            raise IOError("Expected file named e.g. vectors.128.f.bin")
+        bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
+                                                             dtype=dtype)
+        with bin_loc.open('rb') as file_:
+            self.data = numpy.fromfile(file_, dtype='float64')
+            self.data = numpy.ascontiguousarray(self.data, dtype='float32')
+        n = 0
+        with (path / 'vocab.txt').open('r') as file_:
+            for line in file_:
+                self.add(line.strip())
+                n += 1
+        if (self.data.size % self.width) == 0:
+            self.data

-    def from_disk(self, path):
-        raise NotImplementedError
+    def to_disk(self, path, **exclude):
+        serializers = OrderedDict((
+            ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
+            ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
+        ))
+        return util.to_disk(path, serializers, exclude)
+
+    def from_disk(self, path, **exclude):
+        def load_keys(path):
+            if path.exists():
+                self.keys = numpy.load(path)
+                for i, key in enumerate(self.keys):
+                    self.keys[i] = key
+                    self.key2row[key] = i
+
+        def load_vectors(path):
+            if path.exists():
+                self.data = numpy.load(path)
+
+        serializers = OrderedDict((
+            ('keys', load_keys),
+            ('vectors', load_vectors),
+        ))
+        util.from_disk(path, serializers, exclude)
+        return self

     def to_bytes(self, **exclude):
         def serialize_weights():
-            if hasattr(self.weights, 'to_bytes'):
-                return self.weights.to_bytes()
+            if hasattr(self.data, 'to_bytes'):
+                return self.data.to_bytes()
             else:
-                return msgpack.dumps(self.weights)
+                return msgpack.dumps(self.data)

         serializers = OrderedDict((
-            ('strings', lambda: self.strings.to_bytes()),
-            ('weights', serialize_weights)
+            ('keys', lambda: msgpack.dumps(self.keys)),
+            ('vectors', serialize_weights)
         ))
         return util.to_bytes(serializers, exclude)

     def from_bytes(self, data, **exclude):
         def deserialize_weights(b):
-            if hasattr(self.weights, 'from_bytes'):
-                self.weights.from_bytes()
+            if hasattr(self.data, 'from_bytes'):
+                self.data.from_bytes()
             else:
-                self.weights = msgpack.loads(b)
+                self.data = msgpack.loads(b)
+
+        def load_keys(keys):
+            self.keys.resize((len(keys),))
+            for i, key in enumerate(keys):
+                self.keys[i] = key
+                self.key2row[key] = i

         deserializers = OrderedDict((
-            ('strings', lambda b: self.strings.from_bytes(b)),
-            ('weights', deserialize_weights)
+            ('keys', lambda b: load_keys(msgpack.loads(b))),
+            ('vectors', deserialize_weights)
         ))
-        return util.from_bytes(deserializers, exclude)
+        util.from_bytes(data, deserializers, exclude)
+        return self
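The `from_glove` docstring above fixes a directory convention: a `vocab.txt` with one word per line sits next to a binary matrix named `vectors.{size}.{f|d}.bin`, and the loader recovers the width and dtype from the file name itself. A stand-alone sketch of that parsing step (the file name is illustrative):

name = 'vectors.128.f.bin'
_, dims, dtype, _ext = name.split('.')
width = int(dims)            # 128 dimensions
assert dtype in ('f', 'd')   # 'f' -> float32, 'd' -> float64 (GloVe's default output)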
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
+import numpy

 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t


@@ -19,9 +20,10 @@ from .tokens.token cimport Token
 from .attrs cimport PROB, LANG
 from .structs cimport SerializedLexemeC

-from .compat import copy_reg, pickle
+from .compat import copy_reg, pickle, basestring_
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
+from .vectors import Vectors
 from . import util
 from . import attrs
 from . import symbols


@@ -63,6 +65,7 @@ cdef class Vocab:
             self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
+        self.vectors = Vectors(self.strings, 300)

     property lang:
         def __get__(self):


@@ -242,13 +245,15 @@ cdef class Vocab:

     @property
     def vectors_length(self):
-        raise NotImplementedError
+        return self.vectors.data.shape[1]

-    def clear_vectors(self):
+    def clear_vectors(self, new_dim=None):
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
-        raise NotImplementedError
+        if new_dim is None:
+            new_dim = self.vectors.data.shape[1]
+        self.vectors = Vectors(self.strings, new_dim)

     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.


@@ -262,7 +267,12 @@ cdef class Vocab:

         RAISES: If no vectors data is loaded, ValueError is raised.
         """
-        raise NotImplementedError
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        if orth in self.vectors.key2row:
+            return self.vectors[orth]
+        else:
+            return numpy.zeros((self.vectors_length,), dtype='f')

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary.


@@ -272,15 +282,19 @@ cdef class Vocab:
         RETURNS:
             None
         """
-        raise NotImplementedError
+        if not isinstance(orth, basestring_):
+            orth = self.strings[orth]
+        self.vectors.add(orth, vector=vector)

     def has_vector(self, orth):
         """Check whether a word has a vector. Returns False if no
         vectors have been loaded. Words can be looked up by string
         or int ID."""
-        return False
+        if isinstance(orth, basestring_):
+            orth = self.strings.add(orth)
+        return orth in self.vectors

-    def to_disk(self, path):
+    def to_disk(self, path, **exclude):
         """Save the current state to a directory.

         path (unicode or Path): A path to a directory, which will be created if


@@ -292,8 +306,10 @@ cdef class Vocab:
         self.strings.to_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('wb') as file_:
             file_.write(self.lexemes_to_bytes())
+        if self.vectors is not None:
+            self.vectors.to_disk(path)

-    def from_disk(self, path):
+    def from_disk(self, path, **exclude):
         """Loads state from a directory. Modifies the object in place and
         returns it.


@@ -305,6 +321,8 @@ cdef class Vocab:
         self.strings.from_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('rb') as file_:
             self.lexemes_from_bytes(file_.read())
+        if self.vectors is not None:
+            self.vectors.from_disk(path, exclude='strings.json')
         return self

     def to_bytes(self, **exclude):


@@ -313,9 +331,16 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Vocab` object.
         """
+        def deserialize_vectors():
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.to_bytes(exclude='strings.json')
+
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
             ('lexemes', lambda: self.lexemes_to_bytes()),
+            ('vectors', deserialize_vectors)
         ))
         return util.to_bytes(getters, exclude)


@@ -326,9 +351,15 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Vocab): The `Vocab` object.
         """
+        def serialize_vectors(b):
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.from_bytes(b, exclude='strings')
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
+            ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
         return self
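Taken together, these hunks give `Vocab` a working vector table: `set_vector` writes through to the `Vectors` store, `has_vector` reports whether a row exists, and `get_vector` falls back to a zero vector for unknown words. A minimal sketch, assuming a freshly constructed `Vocab` and the API exactly as shown in this diff (values are illustrative):

import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.clear_vectors(3)   # drop the default table and switch to width 3
vocab.set_vector(u'apple', numpy.asarray([1., 2., 3.], dtype='f'))

assert vocab.has_vector(u'apple')
assert not vocab.has_vector(u'pear')
assert list(vocab.get_vector(u'apple')) == [1.0, 2.0, 3.0]
assert list(vocab.get_vector(u'pear')) == [0.0, 0.0, 0.0]   # unknown words -> zeros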
@@ -2,9 +2,8 @@

 if [ "${VIA}" == "pypi" ]; then
     rm -rf *
-    pip install spacy
-    python -m spacy.en.download
-    python -m spacy.de.download
+    pip install spacy-nightly
+    python -m spacy download en
 fi

 if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
@@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
     label      - [string] aside title (optional or false for no label)
     language   - [string] language for syntax highlighting (default: "python")
                  supports basic relevant languages available for PrismJS
-    icon       - [string] icon to display next to code block, mostly used for old/new
+    prompt     - [string] prompt or icon to display next to code block, (mostly used for old/new)
     height     - [integer] optional height to clip code block to

-mixin code(label, language, icon, height)
+mixin code(label, language, prompt, height)
     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
         if label
             h4.u-text-label.u-text-label--dark=label
+        - var icon = (prompt == 'accept' || prompt == 'reject')
         if icon
             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
                 +icon(icon, 18)

-        code.c-code-block__content
+        code.c-code-block__content(data-prompt=icon ? null : prompt)
             block
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none


@@ -35,6 +35,13 @@
     font: normal normal 1.1rem/#{2} $font-code
     padding: 1em 2em

+    &[data-prompt]:before,
+        content: attr(data-prompt)
+        margin-right: 0.65em
+        display: inline-block
+        vertical-align: middle
+        opacity: 0.5
+

 //- Inline code
Some files were not shown because too many files have changed in this diff.