Update feature/lemmatizer from develop

This commit is contained in:
Matthew Honnibal 2019-03-10 02:45:33 +01:00
commit 78aba46530
12 changed files with 207 additions and 97 deletions

View File

@ -1,21 +0,0 @@
# AppVeyor (Windows) CI configuration.
# Each matrix entry sets PYTHON to one interpreter directory; every command
# below is run through "%PYTHON%\python.exe", so the job is repeated once per
# listed Python version.
environment:
  matrix:
    - PYTHON: "C:\\Python35-x64"
    - PYTHON: "C:\\Python36-x64"
    - PYTHON: "C:\\Python37-x64"

install:
  # We need wheel installed to build wheels
  - "%PYTHON%\\python.exe -m pip install wheel"
  - "%PYTHON%\\python.exe -m pip install cython"
  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
  - "%PYTHON%\\python.exe -m pip install -e ."

# No separate build phase: the package is already installed (editable) by the
# last install command above.
build: off

test_script:
  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"

# Build a wheel after the tests and expose everything under dist\ as an
# artifact.
after_test:
  - "%PYTHON%\\python.exe setup.py bdist_wheel"

artifacts:
  - path: dist\*

# Do not run CI for the spacy.io branch.
branches:
  except:
    - spacy.io

View File

@ -5,23 +5,16 @@ dist: trusty
group: edge
python:
- "2.7"
- "3.5"
- "3.6"
os:
- linux
env:
- VIA=compile
- VIA=flake8
install:
- "./travis.sh"
- pip install flake8
- "pip install -r requirements.txt"
- "python setup.py build_ext --inplace"
- "pip install -e ."
script:
- "cat /proc/cpuinfo | grep flags | head -n 1"
- "pip install pytest pytest-timeout"
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
- if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- "python -m pytest --tb=native spacy"
branches:
except:
- spacy.io

View File

@ -14,8 +14,8 @@ released under the MIT license.
💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
[![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy)
[![Appveyor Build Status](https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square&logo=appveyor)](https://ci.appveyor.com/project/explosion/spaCy)
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases)
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.python.org/pypi/spacy)
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)

92
azure-pipelines.yml Normal file
View File

@ -0,0 +1,92 @@
# Azure Pipelines CI configuration.
# Runs on every branch except 'spacy.io'; changes that only touch the website
# or Markdown files are excluded from triggering a build.
trigger:
  batch: true
  branches:
    include:
      - '*'
    exclude:
      - 'spacy.io'
  paths:
    exclude:
      - 'website/*'
      - '*.md'

jobs:
  # Perform basic checks for the most important errors (syntax etc.). Uses the
  # config defined in .flake8 and overrides the selected codes.
  - job: 'Validate'
    pool:
      vmImage: 'ubuntu-16.04'
    steps:
      - task: UsePythonVersion@0
        inputs:
          versionSpec: '3.7'
      - script: |
          pip install flake8
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
        displayName: 'flake8'

  # Build and test on every OS image / Python version pair in the matrix.
  # Only starts once 'Validate' has finished.
  - job: 'Test'
    dependsOn: 'Validate'
    strategy:
      matrix:
        # Python 2.7 currently doesn't work because it seems to be a narrow
        # unicode build, which causes problems with the regular expressions
        # Python27Linux:
        #   imageName: 'ubuntu-16.04'
        #   python.version: '2.7'
        # Python27Mac:
        #   imageName: 'macos-10.13'
        #   python.version: '2.7'
        Python35Linux:
          imageName: 'ubuntu-16.04'
          python.version: '3.5'
        Python35Windows:
          imageName: 'vs2017-win2016'
          python.version: '3.5'
        Python35Mac:
          imageName: 'macos-10.13'
          python.version: '3.5'
        Python36Linux:
          imageName: 'ubuntu-16.04'
          python.version: '3.6'
        Python36Windows:
          imageName: 'vs2017-win2016'
          python.version: '3.6'
        Python36Mac:
          imageName: 'macos-10.13'
          python.version: '3.6'
        Python37Linux:
          imageName: 'ubuntu-16.04'
          python.version: '3.7'
        Python37Windows:
          imageName: 'vs2017-win2016'
          python.version: '3.7'
        Python37Mac:
          imageName: 'macos-10.13'
          python.version: '3.7'
      maxParallel: 4
    pool:
      # Filled in per matrix entry from its imageName variable.
      vmImage: $(imageName)
    steps:
      - task: UsePythonVersion@0
        inputs:
          versionSpec: '$(python.version)'
          architecture: 'x64'
      # Downgrading pip is necessary to prevent a wheel version incompatibility.
      # Might be fixed in the future or some other way, so investigate again.
      - script: |
          python -m pip install --upgrade pip==18.1
          pip install -r requirements.txt
        displayName: 'Install dependencies'
      - script: |
          python setup.py build_ext --inplace
          pip install -e .
        displayName: 'Build and install'
      - script: python -m pytest --tb=native spacy
        displayName: 'Run tests'

View File

@ -84,16 +84,54 @@ def _zero_init(model):
@layerize
def _preprocess_doc(docs, drop=0.0):
keys = [doc.to_array(LOWER) for doc in docs]
ops = Model.ops
# The dtype here matches what thinc is expecting -- which differs per
# platform (by int definition). This should be fixed once the problem
# is fixed on Thinc's side.
lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape) + 1.0
lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
keys = numpy.concatenate(keys)
vals = numpy.zeros(keys.shape, dtype='f')
return (keys, vals, lengths), None
def with_cpu(ops, model):
    """Wrap a model that should run on CPU, transferring inputs and outputs
    as necessary.

    ops: Backend whose `asarray` is used (via `_to_device`) to convert the
        wrapped model's outputs.
    model: The model to wrap. `model.to_cpu()` is called on it immediately.
    RETURNS: The result of `wrap(with_cpu_forward, model)`.
    """
    model.to_cpu()

    def with_cpu_forward(inputs, drop=0.):
        # Run the wrapped model on host copies of the inputs, then hand the
        # outputs back through `ops`.
        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
        gpu_outputs = _to_device(ops, cpu_outputs)

        def with_cpu_backprop(d_outputs, sgd=None):
            # Incoming gradients are converted with `_to_cpu` before calling
            # the wrapped model's callback. Whatever that callback returns is
            # passed on unchanged (it is not sent through `_to_device`).
            cpu_d_outputs = _to_cpu(d_outputs)
            return backprop(cpu_d_outputs, sgd=sgd)

        return gpu_outputs, with_cpu_backprop

    return wrap(with_cpu_forward, model)
def _to_cpu(X):
if isinstance(X, numpy.ndarray):
return X
elif isinstance(X, tuple):
return tuple([_to_cpu(x) for x in X])
elif isinstance(X, list):
return [_to_cpu(x) for x in X]
elif hasattr(X, 'get'):
return X.get()
else:
return X
def _to_device(ops, X):
if isinstance(X, tuple):
return tuple([_to_device(ops, x) for x in X])
elif isinstance(X, list):
return [_to_device(ops, x) for x in X]
else:
return ops.asarray(X)
@layerize
def _preprocess_doc_bigrams(docs, drop=0.0):
unigrams = [doc.to_array(LOWER) for doc in docs]
@ -655,7 +693,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = _preprocess_doc >> LinearModel(nr_class)
linear_model = (
_preprocess_doc
>> with_cpu(Model.ops, LinearModel(nr_class))
)
if cfg.get("exclusive_classes"):
output_layer = Softmax(nr_class, nr_class * 2)
else:

View File

@ -23,15 +23,16 @@ CONVERTERS = {
}
# File types
FILE_TYPES = ("json", "jsonl")
FILE_TYPES = ("json", "jsonl", "msg")
FILE_TYPES_STDOUT = ("json", "jsonl")
@plac.annotations(
input_file=("Input file", "positional", None, str),
output_dir=("Output directory for converted file", "positional", None, str),
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
@ -58,6 +59,13 @@ def convert(
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
exits=1,
)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
# TODO: support msgpack via stdout in srsly?
msg.fail(
"Can't write .{} data to stdout.".format(file_type),
"Please specify an output directory.",
exits=1,
)
if not input_path.exists():
msg.fail("Input file not found", input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
@ -78,6 +86,8 @@ def convert(
srsly.write_json(output_file, data)
elif file_type == "jsonl":
srsly.write_jsonl(output_file, data)
elif file_type == "msg":
srsly.write_msgpack(output_file, data)
msg.good("Generated output file ({} documents)".format(len(data)), output_file)
else:
# Print to stdout

View File

@ -342,7 +342,7 @@ class Errors(object):
"equal to span length ({span_len}).")
E122 = ("Cannot find token to be split. Did it get merged?")
E123 = ("Cannot find head of token to be split. Did it get merged?")
E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.")

View File

@ -153,10 +153,13 @@ class GoldCorpus(object):
loc = util.ensure_path(loc)
if loc.parts[-1].endswith("json"):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
elif loc.parts[-1].endswith("msg"):
gold_tuples = srsly.read_msgpack(loc)
else:
raise ValueError(Errors.E124.format(path=path2str(loc)))
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
for item in gold_tuples:
yield item
i += len(item[1])

View File

@ -21,7 +21,9 @@ _suffixes = (
r"(?<=[0-9])%", # 4% -> ["4", "%"]
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
)

View File

@ -379,7 +379,7 @@ _regular_exp = [
_regular_exp += [
"^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
prefix=p,
hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash
hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash
elision=ELISION,
al=ALPHA_LOWER,
)
@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN)
TOKENIZER_EXCEPTIONS = _exc
TOKEN_MATCH = re.compile(
"|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
"|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
).match

View File

@ -1,32 +0,0 @@
#!/bin/bash
# Travis CI install step. The VIA environment variable selects how spaCy is
# installed before the tests run:
#   pypi    - install the published spacy-nightly package and the "en" model
#   sdist   - install the source distribution built for this commit
#             (skipped for pull requests)
#   compile - build and install from the checked-out sources

if [ "${VIA}" == "pypi" ]; then
    # Remove the checkout so the installed package is used.
    rm -rf *
    pip install spacy-nightly
    python -m spacy download en
fi

if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
    rm -rf *
    # -y: without it pip waits for interactive confirmation, which a CI job
    # cannot answer.
    pip uninstall -y spacy
    # wget names the download after the last URL component, i.e. the commit.
    wget "https://api.explosion.ai/build/spacy/sdist/${TRAVIS_COMMIT}"
    mv "${TRAVIS_COMMIT}" sdist.tgz
    pip install -U sdist.tgz
fi

if [ "${VIA}" == "compile" ]; then
    pip install -r requirements.txt
    python setup.py build_ext --inplace
    pip install -e .
fi

# Disabled legacy setup (WordNet download and model initialisation):
# mkdir -p corpora/en
# cd corpora/en
# wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz
# tar -xzf WordNet-3.0.tar.gz
# mv WordNet-3.0 wordnet
# cd ../../
# mkdir models/
# python bin/init_model.py en lang_data/ corpora/ models/en
#fi

View File

@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
extension of the input file.
```bash
$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
[--morphology]
$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
[--n-sents] [--morphology] [--lang]
```
| Argument | Type | Description |
| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
| `input_file` | positional | Input file. |
| `output_dir` | positional | Output directory for converted JSON file. |
| `converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
| `input_file` | positional | Input file. |
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--morphology`, `-m` | flag | Enable appending morphology to tags. |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
The following file format converters are available:
### Output file types {new="2.1"}
| ID | Description |
| ----------------- | --------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension (default). |
| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | Tab-based named entity recognition format. |
| `iob` | IOB or IOB2 named entity recognition format. |
> #### Which format should I choose?
>
> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
> that there's one JSON object per line. Unlike a regular JSON file, it can also
> be read in line-by-line and you won't have to parse the _entire file_ first.
> This makes it a very convenient format for larger corpora.
All output files generated by this command are compatible with
[`spacy train`](/api/cli#train).
| ID | Description |
| ------- | --------------------------------- |
| `jsonl` | Newline-delimited JSON (default). |
| `json` | Regular JSON. |
| `msg` | Binary MessagePack format. |
### Converter options
<!-- TODO: document jsonl option maybe update it? -->
| ID | Description |
| ------------------------------ | --------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension (default). |
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | Tab-based named entity recognition format. |
| `iob` | IOB or IOB2 named entity recognition format. |
## Train {#train}