Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)

Merge branch 'master' into develop
Commit: ad74245be9
@@ -10,7 +10,7 @@ open-source software, released under the MIT license.

 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.

-💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

 .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
     :target: https://travis-ci.org/explosion/spaCy
@@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests:

 =========== ============== ===========
 Version     Date           Description
 =========== ============== ===========
+`v1.8.0`_   ``2017-04-16`` Better NER training, saving and loading
 `v1.7.5`_   ``2017-04-07`` Bug fixes and new CLI commands
 `v1.7.3`_   ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes
 `v1.7.2`_   ``2017-03-20`` Small fixes to beam parser and model linking

@@ -350,6 +351,7 @@ Version     Date           Description

 `v0.93`_    ``2015-09-22`` Bug fixes to word vectors
 =========== ============== ===========

+.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0
 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5
 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3
 .. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2
@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2

@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):


 def iter_texts_from_json_bz2(loc):
-    '''
+    """
     Iterator of unicode strings, one per document (here, a comment).

     Expects a a path to a BZ2 file, which should be new-line delimited JSON. The
     document text should be in a string field titled 'body'.

     This is the data format of the Reddit comments corpus.
-    '''
+    """
     with bz2.BZ2File(loc) as file_:
         for i, line in enumerate(file_):
             yield ujson.loads(line)['body']

@@ -80,7 +81,7 @@ def is_sent_begin(word):
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
     if not path.exists(out_dir):
         path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
@@ -1,22 +1,45 @@
+#!/usr/bin/env python
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
+# coding: utf8
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
+from pathlib import Path

 import spacy
 from spacy.pipeline import EntityRecognizer
 from spacy.gold import GoldParse
 from spacy.tagger import Tagger


-try:
-    unicode
-except:
-    unicode = str
-
-
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab.
+    # Add new words to vocab
     for raw_text, _ in train_data:
         doc = nlp.make_doc(raw_text)
         for word in doc:

@@ -30,11 +53,14 @@ def train_ner(nlp, train_data, output_dir):
             nlp.tagger(doc)
             loss = nlp.entity.update(doc, gold)
     nlp.end_training()
-    nlp.save_to_directory(output_dir)
+    if output_dir:
+        nlp.save_to_directory(output_dir)


 def main(model_name, output_directory=None):
     nlp = spacy.load(model_name)
+    if output_directory is not None:
+        output_directory = Path(output_directory)

     train_data = [
         (

@@ -55,18 +81,18 @@ def main(model_name, output_directory=None):
         )
     ]
     nlp.entity.add_label('ANIMAL')
-    if output_directory is not None:
-        output_directory = pathlib.Path(output_directory)
     ner = train_ner(nlp, train_data, output_directory)

+    # Test that the entity is recognized
     doc = nlp('Do you like horses?')
     for ent in doc.ents:
         print(ent.label_, ent.text)
-    nlp2 = spacy.load('en', path=output_directory)
-    nlp2.entity.add_label('ANIMAL')
-    doc2 = nlp2('Do you like horses?')
-    for ent in doc2.ents:
-        print(ent.label_, ent.text)
+    if output_directory:
+        nlp2 = spacy.load('en', path=output_directory)
+        nlp2.entity.add_label('ANIMAL')
+        doc2 = nlp2('Do you like horses?')
+        for ent in doc2.ents:
+            print(ent.label_, ent.text)


 if __name__ == '__main__':
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0
@@ -1,39 +1,40 @@
 # coding: utf8
 from __future__ import unicode_literals

-from pathlib import Path
-from .util import set_lang_class, get_lang_class, parse_package_meta
+from . import util
 from .deprecated import resolve_model_name
 from .cli import info

 from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


-set_lang_class(en.English.lang, en.English)
-set_lang_class(de.German.lang, de.German)
-set_lang_class(es.Spanish.lang, es.Spanish)
-set_lang_class(pt.Portuguese.lang, pt.Portuguese)
-set_lang_class(fr.French.lang, fr.French)
-set_lang_class(it.Italian.lang, it.Italian)
-set_lang_class(hu.Hungarian.lang, hu.Hungarian)
-set_lang_class(zh.Chinese.lang, zh.Chinese)
-set_lang_class(nl.Dutch.lang, nl.Dutch)
-set_lang_class(sv.Swedish.lang, sv.Swedish)
-set_lang_class(fi.Finnish.lang, fi.Finnish)
-set_lang_class(bn.Bengali.lang, bn.Bengali)
-set_lang_class(he.Hebrew.lang, he.Hebrew)
+_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
+              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
+              fi.Finnish, bn.Bengali, he.Hebrew)
+
+
+for _lang in _languages:
+    util.set_lang_class(_lang.lang, _lang)


 def load(name, **overrides):
-    data_path = overrides.get('path', util.get_data_path())
-    model_name = resolve_model_name(name)
-    meta = parse_package_meta(data_path, model_name, require=False)
+    if overrides.get('path') in (None, False, True):
+        data_path = util.get_data_path()
+        model_name = resolve_model_name(name)
+        model_path = data_path / model_name
+        if not model_path.exists():
+            lang_name = util.get_lang_class(name).lang
+            model_path = None
+            util.print_msg(
+                "Only loading the '{}' tokenizer.".format(lang_name),
+                title="Warning: no model found for '{}'".format(name))
+    else:
+        model_path = util.ensure_path(overrides['path'])
+        data_path = model_path.parent
+        model_name = ''
+    meta = util.parse_package_meta(data_path, model_name, require=False)
     lang = meta['lang'] if meta and 'lang' in meta else name
-    cls = get_lang_class(lang)
+    cls = util.get_lang_class(lang)
     overrides['meta'] = meta
-    model_path = Path(data_path / model_name)
-    if model_path.exists():
-        overrides['path'] = model_path
+    overrides['path'] = model_path

     return cls(**overrides)
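The reworked load() distinguishes between loading by shortcut or package name and loading from an explicit data directory. A minimal sketch of both call patterns, assuming spaCy 1.8 and a hypothetical model directory:

    import spacy

    # Resolve a shortcut link or installed package name under spaCy's data path
    nlp = spacy.load('en')

    # Load from an explicit directory: with the new logic the supplied path is
    # used directly and its parent is treated as the data path
    # (the path below is purely illustrative)
    nlp_local = spacy.load('en', path='/home/me/data/en_example_model')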
@@ -63,15 +63,16 @@ class CLI(object):
     @plac.annotations(
         input_dir=("directory with model data", "positional", None, str),
         output_dir=("output parent directory", "positional", None, str),
+        meta=("path to meta.json", "option", "m", str),
         force=("force overwriting of existing folder in output directory", "flag", "f", bool)
     )
-    def package(self, input_dir, output_dir, force=False):
+    def package(self, input_dir, output_dir, meta=None, force=False):
         """
         Generate Python package for model data, including meta and required
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
-        cli_package(input_dir, output_dir, force)
+        cli_package(input_dir, output_dir, meta, force)


     @plac.annotations(
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '1.7.5'
+__version__ = '1.8.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
@@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False):


 def symlink(model_path, link_name, force):
     model_path = Path(model_path)
-    if not Path(model_path).exists():
+    if not model_path.exists():
         util.sys_exit(
             "The data should be located in {p}".format(p=model_path),
             title="Can't locate model data")

@@ -48,12 +48,16 @@ def symlink(model_path, link_name, force):
     except:
         # This is quite dirty, but just making sure other errors are caught so
         # users at least see a proper message.
-        util.sys_exit(
-            "Creating a symlink in spacy/data failed. You can still import "
-            "the model as a Python package and call its load() method, or "
-            "create the symlink manually:",
+        util.print_msg(
+            "Creating a symlink in spacy/data failed. Make sure you have the "
+            "required permissions and try re-running the command as admin, or "
+            "use a virtualenv to install spaCy in a user directory, instead of "
+            "doing a system installation.",
+            "You can still import the model as a Python package and call its "
+            "load() method, or create the symlink manually:",
             "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
             title="Error: Couldn't link model to '{l}'".format(l=link_name))
+        raise

     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
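The friendlier message now tells users they can create the symlink by hand when the command lacks permissions. A rough sketch of what that amounts to; the paths are hypothetical, platform-dependent, and on Windows this requires elevated rights:

    from pathlib import Path

    # Point spacy/data/<link_name> at the installed model package's data directory,
    # mirroring what `python -m spacy link` does when it has permission to do so.
    model_path = Path('/usr/local/lib/python3.5/site-packages/en_example_model/en_example_model-1.0.0')
    link_path = Path('/usr/local/lib/python3.5/site-packages/spacy/data/en_example')
    link_path.symlink_to(model_path)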
@@ -9,16 +9,24 @@ from ..compat import unicode_, json_dumps
 from .. import util


-def package(input_dir, output_dir, force):
+def package(input_dir, output_dir, meta_path, force):
     input_path = Path(input_dir)
     output_path = Path(output_dir)
-    check_dirs(input_path, output_path)
+    meta_path = util.ensure_path(meta_path)
+    check_dirs(input_path, output_path, meta_path)

     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('en_model_name/__init__.py')
-    meta = generate_meta()

+    meta_path = meta_path or input_path / 'meta.json'
+    if meta_path.is_file():
+        util.print_msg(unicode_(meta_path), title="Reading meta.json from file")
+        meta = util.read_json(meta_path)
+    else:
+        meta = generate_meta()
+
+    validate_meta(meta, ['lang', 'name', 'version'])
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
     main_path = output_path / model_name_v

@@ -37,20 +45,23 @@
         title="Successfully created package {p}".format(p=model_name_v))


-def check_dirs(input_path, output_path):
+def check_dirs(input_path, output_path, meta_path):
     if not input_path.exists():
         util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
     if not output_path.exists():
         util.sys_exit(unicode_(output_path), title="Output directory not found")
+    if meta_path and not meta_path.exists():
+        util.sys_exit(unicode_(meta_path), title="meta.json not found")


 def create_dirs(package_path, force):
     if package_path.exists():
         if force:
-            shutil.rmtree(unicode_(package_path.as_posix))
+            shutil.rmtree(unicode_(package_path))
         else:
-            util.sys_exit(unicode_(package_path.as_posix),
-                "Please delete the directory and try again.",
+            util.sys_exit(unicode_(package_path),
+                "Please delete the directory and try again, or use the --force "
+                "flag to overwrite existing directories.",
                 title="Package directory already exists")
     Path.mkdir(package_path, parents=True)

@@ -80,6 +91,14 @@ def generate_meta():
     return meta


+def validate_meta(meta, keys):
+    for key in keys:
+        if key not in meta or meta[key] == '':
+            util.sys_exit(
+                "This setting is required to build your package.",
+                title='No "{k}" setting found in meta.json'.format(k=key))
+
+
 def get_template(filepath):
     url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
     r = requests.get(url + filepath)
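The new validate_meta() exits with an error when a meta.json is missing, or leaves empty, any of the required fields. A minimal sketch of a meta dict that would pass the ['lang', 'name', 'version'] check; the values are purely illustrative:

    # Illustrative values only; validate_meta(meta, ['lang', 'name', 'version'])
    # calls util.sys_exit() if any of these keys is absent or an empty string.
    meta = {
        "lang": "en",
        "name": "example_model",
        "version": "1.0.0"
    }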
@@ -2,8 +2,8 @@
 from __future__ import unicode_literals, division, print_function

 import json
-from pathlib import Path

+from ..util import ensure_path
 from ..scorer import Scorer
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json

@@ -12,9 +12,9 @@ from .. import util

 def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
           parser_L1):
-    output_path = Path(output_dir)
-    train_path = Path(train_data)
-    dev_path = Path(dev_data)
+    output_path = ensure_path(output_dir)
+    train_path = ensure_path(train_data)
+    dev_path = ensure_path(dev_data)
     check_dirs(output_path, train_path, dev_path)

     lang = util.get_lang_class(language)

@@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,


 def train_config(config):
-    config_path = Path(config)
+    config_path = ensure_path(config)
     if not config_path.is_file():
         util.sys_exit(config_path.as_posix(), title="Config file not found")
     config = json.load(config_path)

@@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_cfg,
                 entity_cfg, n_iter):
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

-    with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
+    with Language.train(output_path, train_data,
+            pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer:
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, user may have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
@@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function
 import io
 import re
 import ujson
-from pathlib import Path

 from .syntax import nonproj
+from .util import ensure_path


 def tags_to_entities(tags):

@@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words):


 def read_json_file(loc, docs_filter=None):
-    loc = Path(loc)
+    loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename)
     else:
-        with io.open(loc, 'r', encoding='utf8') as file_:
+        with loc.open('r', encoding='utf8') as file_:
             docs = ujson.load(file_)
             for doc in docs:
                 if docs_filter is not None and not docs_filter(doc):
@@ -204,15 +204,18 @@ class Language(object):
     @classmethod
     @contextmanager
     def train(cls, path, gold_tuples, **configs):
-        if parser_cfg['pseudoprojective']:
+        parser_cfg = configs.get('deps', {})
+        if parser_cfg.get('pseudoprojective'):
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

         for subdir in ('deps', 'ner', 'pos'):
             if subdir not in configs:
                 configs[subdir] = {}
-        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        if parser_cfg:
+            configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        if 'ner' in configs:
+            configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

         cls.setup_directory(path, **configs)

@@ -236,8 +239,7 @@ class Language(object):
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
         self.end_training()
-        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
-                               pos=self.tagger.cfg)
+        self.save_to_directory(path)

     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:
@@ -40,7 +40,7 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse


-USE_FTRL = False
+USE_FTRL = True
 DEBUG = False
 def set_debug(val):
     global DEBUG
@@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
     assert lemmatizer.noun(text) == set(lemmas)


+@pytest.mark.xfail
 @pytest.mark.models
 def test_tagger_lemmatizer_base_forms(lemmatizer):
     if lemmatizer is None:
@@ -3,9 +3,8 @@ from __future__ import unicode_literals

 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util

-from os import path
 import pytest


@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n

 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
@@ -192,6 +192,8 @@ cdef class Token:
     property lemma:
         def __get__(self):
             return self.c.lemma
+        def __set__(self, int lemma):
+            self.c.lemma = lemma

     property pos:
         def __get__(self):

@@ -570,6 +572,8 @@ cdef class Token:
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
+        def __set__(self, unicode lemma_):
+            self.c.lemma = self.vocab.strings[lemma_]

     property pos_:
         def __get__(self):
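With the new __set__ implementations, a token's lemma becomes writable from Python as well as readable. A short sketch of the usage this enables; the model name and text are illustrative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'The horses are running')
    token = doc[1]
    # Assigning to lemma_ interns the string in the vocab's StringStore and
    # stores its ID on the underlying token struct, mirroring the new setter.
    token.lemma_ = u'horse'
    print(token.text, token.lemma_)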
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import io
 import ujson
 import re
 from pathlib import Path

@@ -21,9 +20,11 @@ def set_lang_class(name, cls):


 def get_lang_class(name):
+    if name in LANGUAGES:
+        return LANGUAGES[name]
     lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
     if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % lang)
+        raise RuntimeError('Language not supported: %s' % name)
     return LANGUAGES[lang]


@@ -46,15 +47,6 @@ def ensure_path(path):
     return path


-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:

@@ -103,22 +95,28 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop


-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


+def read_json(location):
+    with location.open('r', encoding='utf8') as f:
+        return ujson.load(f)
+
+
 def parse_package_meta(package_path, package, require=True):
+    """
+    Check if a meta.json exists in a package and return its contents as a
+    dictionary. If require is set to True, raise an error if no meta.json found.
+    """
+    # TODO: Allow passing in full model path and only require one argument
+    # instead of path and package name. This lets us avoid passing in an awkward
+    # empty string in spacy.load() if user supplies full model path.
     location = package_path / package / 'meta.json'
     if location.is_file():
-        with location.open('r', encoding='utf8') as f:
-            meta = ujson.load(f)
-        return meta
+        return read_json(location)
     elif require:
         raise IOError("Could not read meta.json from %s" % location)
     else:

@@ -126,10 +124,11 @@ def parse_package_meta(package_path, package, require=True):


 def get_raw_input(description, default=False):
-    """Get user input via raw_input / input and return input value. Takes a
+    """
+    Get user input via raw_input / input and return input value. Takes a
     description for the prompt, and an optional default value that's displayed
-    with the prompt."""
+    with the prompt.
+    """
     additional = ' (default: {d})'.format(d=default) if default else ''
     prompt = ' {d}{a}: '.format(d=description, a=additional)
     user_input = input_(prompt)

@@ -137,9 +136,10 @@ def get_raw_input(description, default=False):


 def print_table(data, **kwargs):
-    """Print data in table format. Can either take a list of tuples or a
-    dictionary, which will be converted to a list of tuples."""
+    """
+    Print data in table format. Can either take a list of tuples or a
+    dictionary, which will be converted to a list of tuples.
+    """
     if type(data) == dict:
         data = list(data.items())

@@ -155,10 +155,11 @@ def print_table(data, **kwargs):


 def print_markdown(data, **kwargs):
-    """Print listed data in GitHub-flavoured Markdown format so it can be
+    """
+    Print listed data in GitHub-flavoured Markdown format so it can be
     copy-pasted into issues. Can either take a list of tuples or a dictionary,
-    which will be converted to a list of tuples."""
+    which will be converted to a list of tuples.
+    """
     def excl_value(value):
         # don't print value if it contains absolute path of directory (i.e.
         # personal info). Other conditions can be included here if necessary.

@@ -175,16 +176,16 @@ def print_markdown(data, **kwargs):

     if 'title' in kwargs and kwargs['title']:
         print(tpl_title.format(msg=kwargs['title']))

     print(tpl_msg.format(msg=markdown))


 def print_msg(*text, **kwargs):
-    """Print formatted message. Each positional argument is rendered as newline-
+    """
+    Print formatted message. Each positional argument is rendered as newline-
     separated paragraph. If kwarg 'title' exist, title is printed above the text
     and highlighted (using ANSI escape sequences manually to avoid unnecessary
-    dependency)."""
+    dependency).
+    """
     message = '\n\n'.join([_wrap_text(t) for t in text])
     tpl_msg = '\n{msg}\n'
     tpl_title = '\n\033[93m{msg}\033[0m'

@@ -196,9 +197,10 @@ def print_msg(*text, **kwargs):


 def _wrap_text(text):
-    """Wrap text at given width using textwrap module. Indent should consist of
-    spaces. Its length is deducted from wrap width to ensure exact wrapping."""
+    """
+    Wrap text at given width using textwrap module. Indent should consist of
+    spaces. Its length is deducted from wrap width to ensure exact wrapping.
+    """
     wrap_max = 80
     indent = ' '
     wrap_width = wrap_max - len(indent)

@@ -208,10 +210,11 @@ def _wrap_text(text):


 def sys_exit(*messages, **kwargs):
-    """Performs SystemExit. For modules used from the command line, like
+    """
+    Performs SystemExit. For modules used from the command line, like
     download and link. To print message, use the same arguments as for
-    print_msg()."""
+    print_msg().
+    """
     if messages:
         print_msg(*messages, **kwargs)
     sys.exit(0)
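The new read_json() helper centralises meta.json loading, and parse_package_meta() now delegates to it. A small sketch of how the pair is used elsewhere in this commit; the path is hypothetical:

    from spacy import util

    meta_path = util.ensure_path('/home/me/data/en_example_model/meta.json')
    if meta_path.is_file():
        meta = util.read_json(meta_path)   # parses the file with ujson and returns a dict
        print(meta.get('lang'), meta.get('version'))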
@@ -12,7 +12,7 @@
     "COMPANY_URL": "https://explosion.ai",
     "DEMOS_URL": "https://demos.explosion.ai",

-    "SPACY_VERSION": "1.7",
+    "SPACY_VERSION": "1.8",
     "LATEST_NEWS": {
         "url": "https://survey.spacy.io/",
         "title": "Take the spaCy user survey and help us improve the library!"
@@ -20,8 +20,10 @@
         "Word vectors": "word-vectors-similarities",
         "Deep learning": "deep-learning",
         "Custom tokenization": "customizing-tokenizer",
+        "Adding languages": "adding-languages",
         "Training": "training",
-        "Adding languages": "adding-languages"
+        "Training NER": "training-ner",
+        "Saving & loading": "saving-loading"
     },
     "Examples": {
         "Tutorials": "tutorials",

@@ -101,11 +103,21 @@

     "customizing-tokenizer": {
         "title": "Customizing the tokenizer",
-        "next": "training"
+        "next": "adding-languages"
     },

     "training": {
-        "title": "Training the tagger, parser and entity recognizer"
+        "title": "Training spaCy's statistical models",
+        "next": "saving-loading"
+    },
+
+    "training-ner": {
+        "title": "Training the Named Entity Recognizer",
+        "next": "saving-loading"
+    },
+
+    "saving-loading": {
+        "title": "Saving and loading models"
     },

     "pos-tagging": {

@@ -356,6 +368,18 @@
     },

     "code": {
+        "Training a new entity type": {
+            "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+            "author": "Matthew Honnibal",
+            "tags": ["ner", "training"]
+        },
+
+        "Training an NER system from scratch": {
+            "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+            "author": "Matthew Honnibal",
+            "tags": ["ner", "training"]
+        },
+
         "Information extraction": {
             "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
             "author": "Matthew Honnibal",
@@ -63,14 +63,16 @@ p
         tag_map = TAG_MAP
         stop_words = STOP_WORDS

-p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the #[code set_lang_class()] function, so that you can use #[code spacy.load()].
+p
+    | Additionally, the new #[code Language] class needs to be added to the
+    | list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py].
+    | The languages are then registered using the #[code set_lang_class()] function.

 +code("spacy/__init__.py").
     from . import en
     from . import xx

-    set_lang_class(en.English.lang, en.English)
-    set_lang_class(xx.Xxxxx.lang, xx.Xxxxx)
+    _languages = (en.English, ..., xx.Xxxxx)

 p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]:
@@ -248,15 +248,17 @@ p
 +tag experimental

 p
-    | Generate a #[+a("/docs/usage/models#own-models") model Python package]
-    | from an existing model data directory. All data files are copied over,
-    | and the meta data can be entered directly from the command line. While
-    | this feature is still experimental, the required file templates are
-    | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
-    | This means you need to be connected to the internet to use this command.
+    | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
+    | from an existing model data directory. All data files are copied over.
+    | If the path to a meta.json is supplied, or a meta.json is found in the
+    | input directory, this file is used. Otherwise, the data can be entered
+    | directly from the command line. While this feature is still experimental,
+    | the required file templates are downloaded from
+    | #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
+    | you need to be connected to the internet to use this command.

 +code(false, "bash").
-    python -m spacy package [input_dir] [output_dir] [--force]
+    python -m spacy package [input_dir] [output_dir] [--meta] [--force]

 +table(["Argument", "Type", "Description"])
     +row

@@ -269,6 +271,11 @@ p
         +cell positional
         +cell Directory to create package folder in.

+    +row
+        +cell #[code meta]
+        +cell option
+        +cell Path to meta.json file (optional).
+
     +row
         +cell #[code --force], #[code -f]
         +cell flag
@@ -137,7 +137,7 @@ p
         return word.ent_type != 0

     def count_parent_verb_by_person(docs):
-        counts = defaultdict(defaultdict(int))
+        counts = defaultdict(lambda: defaultdict(int))
         for doc in docs:
             for ent in doc.ents:
                 if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
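The fix matters because defaultdict expects a zero-argument callable as its default factory: defaultdict(defaultdict(int)) passes an already-built dict instance and is rejected with a TypeError, while the lambda builds a fresh inner counter on demand. A minimal sketch:

    from collections import defaultdict

    # counts = defaultdict(defaultdict(int))   # TypeError: first argument must be callable
    counts = defaultdict(lambda: defaultdict(int))
    counts['person']['say'] += 1
    print(counts['person']['say'])   # 1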
@@ -235,62 +235,13 @@ p

 p
     | If you've trained your own model, for example for
-    | #[+a("/docs/usage/adding-languages") additional languages], you can
-    | create a shortuct link for it by pointing #[code spacy.link] to the
-    | model's data directory. To allow your model to be downloaded and
-    | installed via pip, you'll also need to generate a package for it. You can
-    | do this manually, or via the new
-    | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    | create all required files, and walk you through generating the meta data.
+    | #[+a("/docs/usage/adding-languages") additional languages] or
+    | #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    | state using the #[code Language.save_to_directory()] method. To make the
+    | model more convenient to deploy, we recommend wrapping it as a Python
+    | package.

-+infobox("Important note")
-    | The model packages are #[strong not suitable] for the public
-    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    | designed for binary data and files over 50 MB. However, if your company
-    | is running an internal installation of pypi, publishing your models on
-    | there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in             # to include meta.json
-        ├── meta.json               # model meta data
-        ├── setup.py                # setup file for pip installation
-        └── en_core_web_md          # model directory
-            ├── __init__.py         # init for pip installation
-            └── en_core_web_md-1.2.0 # model data
-
-p
-    | You can find templates for all files in our
-    | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
-    | Unless you want to customise installation and loading, the only file
-    | you'll need to modify is #[code meta.json], which includes the model's
-    | meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    | Keep in mind that the directories need to be named according to the
-    | naming conventions. The #[code lang] setting is also used to create the
-    | respective #[code Language] class in spaCy, which will later be returned
-    | by the model's #[code load()] method.
-
-p
-    | To generate the package, run the following command from within the
-    | directory. This will create a #[code .tar.gz] archive in a directory
-    | #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading") saving and loading models].
website/docs/usage/saving-loading.jade (new file, 108 lines)
@@ -0,0 +1,108 @@
include ../../_includes/_mixins

p
    | After training your model, you'll usually want to save its state, and load
    | it back later. You can do this with the #[code Language.save_to_directory()]
    | method:

+code.
    nlp.save_to_directory('/home/me/data/en_example_model')

p
    | The directory will be created if it doesn't exist, and the whole pipeline
    | will be written out. To make the model more convenient to deploy, we
    | recommend wrapping it as a Python package.

+h(2, "generating") Generating a model package

+infobox("Important note")
    | The model packages are #[strong not suitable] for the public
    | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
    | designed for binary data and files over 50 MB. However, if your company
    | is running an internal installation of pypi, publishing your models on
    | there can be a convenient solution to share them with your team.

p
    | spaCy comes with a handy CLI command that will create all required files,
    | and walk you through generating the meta data. You can also create the
    | meta.json manually and place it in the model data directory, or supply a
    | path to it using the #[code --meta] flag. For more info on this, see the
    | #[+a("/docs/usage/cli/#package") #[code package] command] documentation.

+aside-code("meta.json", "json").
    {
        "name": "example_model",
        "lang": "en",
        "version": "1.0.0",
        "spacy_version": ">=1.7.0,<2.0.0",
        "description": "Example model for spaCy",
        "author": "You",
        "email": "you@example.com",
        "license": "CC BY-SA 3.0"
    }

+code(false, "bash").
    python -m spacy package /home/me/data/en_example_model /home/me/my_models

p This command will create a model package directory that should look like this:

+code("Directory structure", "yaml").
    └── /
        ├── MANIFEST.in                 # to include meta.json
        ├── meta.json                   # model meta data
        ├── setup.py                    # setup file for pip installation
        └── en_example_model            # model directory
            ├── __init__.py             # init for pip installation
            └── en_example_model-1.0.0  # model data

p
    | You can also find templates for all files in our
    | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources].
    | If you're creating the package manually, keep in mind that the directories
    | need to be named according to the naming conventions of
    | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The
    | #[code lang] setting in the meta.json is also used to create the
    | respective #[code Language] class in spaCy, which will later be returned
    | by the model's #[code load()] method.

+h(2, "building") Building a model package

p
    | To build the package, run the following command from within the
    | directory. This will create a #[code .tar.gz] archive in a directory
    | #[code /dist].

+code(false, "bash").
    python setup.py sdist

p
    | For more information on building Python packages, see the
    | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].


+h(2, "loading") Loading a model package

p
    | Model packages can be installed by pointing pip to the model's
    | #[code .tar.gz] archive:

+code(false, "bash").
    pip install /path/to/en_example_model-1.0.0.tar.gz

p You'll then be able to load the model as follows:

+code.
    import en_example_model
    nlp = en_example_model.load()

p
    | To load the model via #[code spacy.load()], you can also
    | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
    | package name to a custom model name of your choice:

+code(false, "bash").
    python -m spacy link en_example_model example

+code.
    import spacy
    nlp = spacy.load('example')
website/docs/usage/training-ner.jade (new file, 174 lines)
@@ -0,0 +1,174 @@
include ../../_includes/_mixins
|
||||||
|
|
||||||
|
p
|
||||||
|
| All #[+a("/docs/usage/models") spaCy models] support online learning, so
|
||||||
|
| you can update a pre-trained model with new examples. You can even add
|
||||||
|
| new classes to an existing model, to recognise a new entity type,
|
||||||
|
| part-of-speech, or syntactic relation. Updating an existing model is
|
||||||
|
| particularly useful as a "quick and dirty solution", if you have only a
|
||||||
|
| few corrections or annotations.
|
||||||
|
|
||||||
|
+h(2, "improving-accuracy") Improving accuracy on existing entity types
|
||||||
|
|
||||||
|
p
|
||||||
|
| To update the model, you first need to create an instance of
|
||||||
|
| #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels
|
||||||
|
| you want to learn. You will then pass this instance to the
|
||||||
|
| #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]]
|
||||||
|
| method. For example:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
import spacy
|
||||||
|
from spacy.gold import GoldParse
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
doc = nlp.make_doc(u'Facebook released React in 2014')
|
||||||
|
gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
|
||||||
|
nlp.entity.update(doc, gold)
|
||||||
|
|
||||||
|
p
|
||||||
|
| You'll usually need to provide many examples to meaningfully improve the
|
||||||
|
| system — a few hundred is a good start, although more is better. You
|
||||||
|
| should avoid iterating over the same few examples multiple times, or the
|
||||||
|
| model is likely to "forget" how to annotate other examples. If you
|
||||||
|
| iterate over the same few examples, you're effectively changing the loss
|
||||||
|
| function. The optimizer will find a way to minimize the loss on your
|
||||||
|
| examples, without regard for the consequences on the examples it's no
|
||||||
|
| longer paying attention to.
|
||||||
|
|
||||||
|
p
|
||||||
|
| One way to avoid this "catastrophic forgetting" problem is to "remind"
|
||||||
|
| the model of other examples by augmenting your annotations with sentences
|
||||||
|
| annotated with entities automatically recognised by the original model.
|
||||||
|
| Ultimately, this is an empirical process: you'll need to
|
||||||
|
| #[strong experiment on your own data] to find a solution that works best
|
||||||
|
| for you.
|
||||||
|
|
||||||
|
+h(2, "adding") Adding a new entity type
|
||||||
|
|
||||||
|
p
|
||||||
|
| You can add new entity types to an existing model. Let's say we want to
|
||||||
|
| recognise the category #[code TECHNOLOGY]. The new category will include
|
||||||
|
| programming languages, frameworks and platforms. First, we need to
|
||||||
|
| register the new entity type:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
nlp.entity.add_label('TECHNOLOGY')
|
||||||
|
|
||||||
|
p
    | Next, iterate over your examples, calling #[code entity.update()]. As
    | above, we want to avoid iterating over only a small number of sentences.
    | A useful compromise is to run the model over a number of plain-text
    | sentences, and pass the entities to #[code GoldParse], as "true"
    | annotations. This encourages the optimizer to find a solution that
    | predicts the new category with minimal difference from the previous
    | output.
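p
    | As a rough sketch, assuming #[code revision_data] was built as above and
    | #[code technology_data] is a placeholder for your new annotations as
    | #[code (text, offsets)] pairs, such a loop could look like this:

+code.
    import random
    from spacy.gold import GoldParse

    # hypothetical hand-annotated examples for the new entity type
    technology_data = [(u'Angular is a framework', [(0, 7, 'TECHNOLOGY')])]
    examples = technology_data + revision_data
    for itn in range(10):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            nlp.entity.update(doc, gold)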
+h(2, "saving-loading") Saving and loading
|
||||||
|
|
||||||
|
p
|
||||||
|
| After training our model, you'll usually want to save its state, and load
|
||||||
|
| it back later. You can do this with the #[code Language.save_to_directory()]
|
||||||
|
| method:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
nlp.save_to_directory('/home/me/data/en_technology')
|
||||||
|
|
||||||
|
p
    | To make the model more convenient to deploy, we recommend wrapping it as
    | a Python package, so that you can install it via pip and load it as a
    | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command]
    | to create all required files and directories.

+code(false, "bash").
    python -m spacy package /home/me/data/en_technology /home/me/my_models
p
    | To build the package and create a #[code .tar.gz] archive, run
    | #[code python setup.py sdist] from within its directory.
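p
    | For example, assuming the package was written to
    | #[code /home/me/my_models/en_technology-1.0.0] (the exact directory and
    | archive names depend on the meta data you entered), you could build and
    | install it like this:

+code(false, "bash").
    cd /home/me/my_models/en_technology-1.0.0
    python setup.py sdist
    pip install dist/en_technology-1.0.0.tar.gz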
+infobox("Saving and loading models")
|
||||||
|
| For more information and a detailed guide on how to package your model,
|
||||||
|
| see the documentation on
|
||||||
|
| #[+a("/docs/usage/saving-loading") saving and loading models].
|
||||||
|
|
||||||
|
p
    | After you've generated and installed the package, you'll be able to
    | load the model as follows:

+code.
    import en_technology

    nlp = en_technology.load()
+h(2, "example") Example: Adding and training an #[code ANIMAL] entity
|
||||||
|
|
||||||
|
p
|
||||||
|
| This script shows how to add a new entity type to an existing pre-trained
|
||||||
|
| NER model. To keep the example short and simple, only four sentences are
|
||||||
|
| provided as examples. In practice, you'll need many more —
|
||||||
|
| #[strong a few hundred] would be a good start. You will also likely need
|
||||||
|
| to mix in #[strong examples of other entity types], which might be
|
||||||
|
| obtained by running the entity recognizer over unlabelled sentences, and
|
||||||
|
| adding their annotations to the training set.
|
||||||
|
|
||||||
|
p
|
||||||
|
| For the full, runnable script of this example, see
|
||||||
|
| #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py].
|
||||||
|
|
||||||
|
+code("Training the entity recognizer").
|
||||||
|
import spacy
|
||||||
|
from spacy.pipeline import EntityRecognizer
|
||||||
|
from spacy.gold import GoldParse
|
||||||
|
from spacy.tagger import Tagger
|
||||||
|
import random
|
||||||
|
|
||||||
|
model_name = 'en'
|
||||||
|
entity_label = 'ANIMAL'
|
||||||
|
output_directory = '/path/to/model'
|
||||||
|
train_data = [
|
||||||
|
("Horses are too tall and they pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')]),
|
||||||
|
("horses are too tall and they pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')]),
|
||||||
|
("horses pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')]),
|
||||||
|
("they pretend to care about your feelings, those horses",
|
||||||
|
[(48, 54, 'ANIMAL')])
|
||||||
|
]
|
||||||
|
|
||||||
|
nlp = spacy.load(model_name)
|
||||||
|
nlp.entity.add_label(entity_label)
|
||||||
|
ner = train_ner(nlp, train_data, output_directory)
|
||||||
|
|
||||||
|
def train_ner(nlp, train_data, output_dir):
|
||||||
|
# Add new words to vocab
|
||||||
|
for raw_text, _ in train_data:
|
||||||
|
doc = nlp.make_doc(raw_text)
|
||||||
|
for word in doc:
|
||||||
|
_ = nlp.vocab[word.orth]
|
||||||
|
|
||||||
|
for itn in range(20):
|
||||||
|
random.shuffle(train_data)
|
||||||
|
for raw_text, entity_offsets in train_data:
|
||||||
|
gold = GoldParse(doc, entities=entity_offsets)
|
||||||
|
doc = nlp.make_doc(raw_text)
|
||||||
|
nlp.tagger(doc)
|
||||||
|
loss = nlp.entity.update(doc, gold)
|
||||||
|
nlp.end_training()
|
||||||
|
nlp.save_to_directory(output_dir)
|
||||||
|
|
||||||
|
p
|
||||||
|
+button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example
|
||||||
|
|
||||||
|
p
    | The actual training is performed by looping over the examples, and
    | calling #[code nlp.entity.update()]. The #[code update()] method steps
    | through the words of the input. At each word, it makes a prediction. It
    | then consults the annotations provided on the #[code GoldParse] instance,
    | to see whether it was right. If it was wrong, it adjusts its weights so
    | that the correct action will score higher next time.
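p
    | Since #[code update()] returns a loss, as in the example above, you can
    | also keep an eye on training progress. As a rough sketch, building on the
    | loop above:

+code.
    for itn in range(20):
        random.shuffle(train_data)
        itn_loss = 0.0
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            itn_loss += nlp.entity.update(doc, gold)
        # if the loss stops decreasing, more iterations are unlikely to help
        print(itn, itn_loss)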
p
    | After training your model, you can
    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
    | models as Python packages, for ease of deployment.
@@ -1,13 +1,10 @@
 include ../../_includes/_mixins

 p
-    | This tutorial describes how to train new statistical models for spaCy's
+    | This workflow describes how to train new statistical models for spaCy's
     | part-of-speech tagger, named entity recognizer and dependency parser.
-
-p
-    | I'll start with some quick code examples, that describe how to train
-    | each model. I'll then provide a bit of background about the algorithms,
-    | and explain how the data and feature templates work.
+    | Once the model is trained, you can then
+    | #[+a("/docs/usage/saving-loading") save and load] it.

 +h(2, "train-pos-tagger") Training the part-of-speech tagger
@@ -48,7 +45,21 @@ p
 p
     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example

-+h(2, "train-entity") Training the dependency parser
++h(2, "extend-entity") Extending the named entity recognizer
+
+p
+    | All #[+a("/docs/usage/models") spaCy models] support online learning, so
+    | you can update a pre-trained model with new examples. You can even add
+    | new classes to an existing model, to recognise a new entity type,
+    | part-of-speech, or syntactic relation. Updating an existing model is
+    | particularly useful as a "quick and dirty solution", if you have only a
+    | few corrections or annotations.
+
+p.o-inline-list
+    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
+    +button("/docs/usage/training-ner", false, "secondary") Usage Workflow
+
++h(2, "train-dependency") Training the dependency parser

 +code.
     from spacy.vocab import Vocab
@@ -67,7 +78,7 @@ p
 p
     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example

-+h(2, 'feature-templates') Customizing the feature extraction
++h(2, "feature-templates") Customizing the feature extraction

 p
     | spaCy currently uses linear models for the tagger, parser and entity