Update feature/lemmatizer from develop

Commit 78aba46530, from the mirror of https://github.com/explosion/spaCy.git
appveyor.yml (deleted; AppVeyor config, filename inferred from content)
@@ -1,21 +0,0 @@
-environment:
-  matrix:
-    - PYTHON: "C:\\Python35-x64"
-    - PYTHON: "C:\\Python36-x64"
-    - PYTHON: "C:\\Python37-x64"
-install:
-  # We need wheel installed to build wheels
-  - "%PYTHON%\\python.exe -m pip install wheel"
-  - "%PYTHON%\\python.exe -m pip install cython"
-  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
-  - "%PYTHON%\\python.exe -m pip install -e ."
-build: off
-test_script:
-  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
-after_test:
-  - "%PYTHON%\\python.exe setup.py bdist_wheel"
-artifacts:
-  - path: dist\*
-branches:
-  except:
-    - spacy.io
.travis.yml (15 lines changed)
@@ -5,23 +5,16 @@ dist: trusty
 group: edge
 python:
   - "2.7"
-  - "3.5"
-  - "3.6"
 os:
   - linux
-env:
-  - VIA=compile
-  - VIA=flake8
 install:
-  - "./travis.sh"
-  - pip install flake8
+  - "pip install -r requirements.txt"
+  - "python setup.py build_ext --inplace"
+  - "pip install -e ."
 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
   - "pip install pytest pytest-timeout"
-  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
-  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
-  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - "python -m pytest --tb=native spacy"
 branches:
   except:
     - spacy.io
README.md (filename inferred from content)
@@ -14,8 +14,8 @@ released under the MIT license.
 
 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
+[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
 [![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy)
-[![Appveyor Build Status](https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square&logo=appveyor)](https://ci.appveyor.com/project/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.python.org/pypi/spacy)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
azure-pipelines.yml (new file, 92 lines)
@@ -0,0 +1,92 @@
+trigger:
+  batch: true
+  branches:
+    include:
+    - '*'
+    exclude:
+    - 'spacy.io'
+  paths:
+    exclude:
+    - 'website/*'
+    - '*.md'
+
+jobs:
+
+# Perform basic checks for the most important errors (syntax etc.). Uses the
+# config defined in .flake8 and overwrites the selected codes.
+- job: 'Validate'
+  pool:
+    vmImage: 'ubuntu-16.04'
+  steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: '3.7'
+  - script: |
+      pip install flake8
+      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+    displayName: 'flake8'
+
+- job: 'Test'
+  dependsOn: 'Validate'
+  strategy:
+    matrix:
+      # Python 2.7 currently doesn't work because it seems to be a narrow
+      # unicode build, which causes problems with the regular expressions
+
+      # Python27Linux:
+      #   imageName: 'ubuntu-16.04'
+      #   python.version: '2.7'
+      # Python27Mac:
+      #   imageName: 'macos-10.13'
+      #   python.version: '2.7'
+      Python35Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.5'
+      Python35Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.5'
+      Python35Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.5'
+      Python36Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.6'
+      Python36Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.6'
+      Python36Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.6'
+      Python37Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.7'
+      Python37Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.7'
+      Python37Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.7'
+    maxParallel: 4
+  pool:
+    vmImage: $(imageName)
+
+  steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: '$(python.version)'
+      architecture: 'x64'
+
+  # Downgrading pip is necessary to prevent a wheel version incompatibility.
+  # Might be fixed in the future or some other way, so investigate again.
+  - script: |
+      python -m pip install --upgrade pip==18.1
+      pip install -r requirements.txt
+    displayName: 'Install dependencies'
+
+  - script: |
+      python setup.py build_ext --inplace
+      pip install -e .
+    displayName: 'Build and install'
+
+  - script: python -m pytest --tb=native spacy
+    displayName: 'Run tests'
spacy/_ml.py (51 lines changed)
@@ -84,16 +84,54 @@ def _zero_init(model):
 @layerize
 def _preprocess_doc(docs, drop=0.0):
     keys = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
     # The dtype here matches what thinc is expecting -- which differs per
     # platform (by int definition). This should be fixed once the problem
     # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape) + 1.0
+    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
+    keys = numpy.concatenate(keys)
+    vals = numpy.zeros(keys.shape, dtype='f')
     return (keys, vals, lengths), None
 
 
+def with_cpu(ops, model):
+    """Wrap a model that should run on CPU, transferring inputs and outputs
+    as necessary."""
+    model.to_cpu()
+
+    def with_cpu_forward(inputs, drop=0.):
+        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
+        gpu_outputs = _to_device(ops, cpu_outputs)
+
+        def with_cpu_backprop(d_outputs, sgd=None):
+            cpu_d_outputs = _to_cpu(d_outputs)
+            return backprop(cpu_d_outputs, sgd=sgd)
+
+        return gpu_outputs, with_cpu_backprop
+
+    return wrap(with_cpu_forward, model)
+
+
+def _to_cpu(X):
+    if isinstance(X, numpy.ndarray):
+        return X
+    elif isinstance(X, tuple):
+        return tuple([_to_cpu(x) for x in X])
+    elif isinstance(X, list):
+        return [_to_cpu(x) for x in X]
+    elif hasattr(X, 'get'):
+        return X.get()
+    else:
+        return X
+
+
+def _to_device(ops, X):
+    if isinstance(X, tuple):
+        return tuple([_to_device(ops, x) for x in X])
+    elif isinstance(X, list):
+        return [_to_device(ops, x) for x in X]
+    else:
+        return ops.asarray(X)
+
+
 @layerize
 def _preprocess_doc_bigrams(docs, drop=0.0):
     unigrams = [doc.to_array(LOWER) for doc in docs]
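A note on the two transfer helpers added above: they walk arbitrarily nested tuples and lists, copying device arrays to host memory via `.get()` (the cupy convention) and passing everything else through untouched. Below is a minimal, self-contained sketch of that recursion. This is not spaCy's code; `FakeGpuArray` is a made-up stand-in for a cupy array.

```python
import numpy

class FakeGpuArray:
    """Stand-in for a cupy ndarray: .get() returns a host (numpy) copy."""
    def __init__(self, data):
        self._data = numpy.asarray(data)

    def get(self):
        return self._data.copy()

def to_cpu(X):
    # Mirrors _to_cpu above: recurse through containers, call .get() on
    # anything that looks like a device array, pass everything else through.
    if isinstance(X, numpy.ndarray):
        return X
    if isinstance(X, tuple):
        return tuple(to_cpu(x) for x in X)
    if isinstance(X, list):
        return [to_cpu(x) for x in X]
    if hasattr(X, "get"):
        return X.get()
    return X

print(to_cpu((FakeGpuArray([1, 2]), [FakeGpuArray([3.0])], "meta")))
# (array([1, 2]), [array([3.])], 'meta')
```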
@@ -655,7 +693,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
         >> zero_init(Affine(nr_class, width, drop_factor=0.0))
     )
 
-    linear_model = _preprocess_doc >> LinearModel(nr_class)
+    linear_model = (
+        _preprocess_doc
+        >> with_cpu(Model.ops, LinearModel(nr_class))
+    )
     if cfg.get("exclusive_classes"):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
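For context, the `(keys, vals, lengths)` triple that `_preprocess_doc` builds, and that `with_cpu(Model.ops, LinearModel(nr_class))` then consumes, is just a ragged batch in flat form. A numpy-only illustration, with made-up token IDs standing in for the output of `doc.to_array(LOWER)`:

```python
import numpy

# Two fake "docs", each an array of lexeme IDs.
docs_as_ids = [numpy.array([3, 5, 7]), numpy.array([2, 9])]

lengths = numpy.array([arr.shape[0] for arr in docs_as_ids], dtype=numpy.int_)
keys = numpy.concatenate(docs_as_ids)          # flat ID stream
vals = numpy.zeros(keys.shape, dtype="f")      # one value slot per key

print(keys, vals, lengths)
# [3 5 7 2 9] [0. 0. 0. 0. 0.] [3 2]
```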
spacy/cli/convert.py (filename inferred from content)
@@ -23,15 +23,16 @@ CONVERTERS = {
 }
 
 # File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")
 
 
 @plac.annotations(
     input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory for converted file", "positional", None, str),
-    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
-    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
 )

@@ -58,6 +59,13 @@ def convert(
         "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
         exits=1,
     )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            "Can't write .{} data to stdout.".format(file_type),
+            "Please specify an output directory.",
+            exits=1,
+        )
     if not input_path.exists():
         msg.fail("Input file not found", input_path, exits=1)
     if output_dir != "-" and not Path(output_dir).exists():

@@ -78,6 +86,8 @@ def convert(
             srsly.write_json(output_file, data)
         elif file_type == "jsonl":
             srsly.write_jsonl(output_file, data)
+        elif file_type == "msg":
+            srsly.write_msgpack(output_file, data)
         msg.good("Generated output file ({} documents)".format(len(data)), output_file)
     else:
         # Print to stdout
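The new `msg` branch delegates to srsly's MessagePack helpers, the same `srsly` module the command already uses for JSON and JSONL. A quick round-trip sketch; the path and payload here are invented for illustration:

```python
import srsly

# Shape loosely follows spaCy's training-JSON documents.
data = [{"id": 0, "paragraphs": []}]
srsly.write_msgpack("/tmp/training.msg", data)   # binary MessagePack on disk
assert srsly.read_msgpack("/tmp/training.msg") == data
```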
spacy/errors.py (filename inferred from content)
@@ -342,7 +342,7 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
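Since the error strings are plain `str.format` templates, the reworked E124 simply defers the supported-formats list to the caller instead of hard-coding it. For example (no spaCy import needed to see the effect):

```python
template = "Cannot read from file: {path}. Supported formats: {formats}"
print(template.format(path="corpus.txt", formats=("json", "jsonl", "msg")))
# Cannot read from file: corpus.txt. Supported formats: ('json', 'jsonl', 'msg')
```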
spacy/gold.pyx (filename inferred from content)
@@ -153,10 +153,13 @@ class GoldCorpus(object):
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith("json"):
                 gold_tuples = read_json_file(loc)
             elif loc.parts[-1].endswith("jsonl"):
                 gold_tuples = srsly.read_jsonl(loc)
+            elif loc.parts[-1].endswith("msg"):
+                gold_tuples = srsly.read_msgpack(loc)
             else:
-                raise ValueError(Errors.E124.format(path=path2str(loc)))
+                supported = ("json", "jsonl", "msg")
+                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
             for item in gold_tuples:
                 yield item
                 i += len(item[1])
spacy/lang/fr/punctuation.py (filename inferred from content)
@@ -21,7 +21,9 @@ _suffixes = (
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
 )
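The `# 4% -> ["4", "%"]` comment describes a lookbehind rule: spaCy compiles each suffix entry into a regex that it searches at the end of a candidate token. Here is that first pattern exercised directly with `re`, outside spaCy:

```python
import re

# Lookbehind: match "%" only when a digit immediately precedes it.
suffix_percent = re.compile(r"(?<=[0-9])%")
print(suffix_percent.search("4%").group())  # '%' -> the tokenizer splits "4" | "%"
```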
spacy/lang/fr/tokenizer_exceptions.py (filename inferred from content)
@@ -379,7 +379,7 @@ _regular_exp = [
 _regular_exp += [
     "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
         prefix=p,
-        hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash
+        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
         elision=ELISION,
         al=ALPHA_LOWER,
     )

@@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN)
 
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
+    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
 ).match
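Adding `re.UNICODE` matters on Python 2, where `re.IGNORECASE` alone only case-folds ASCII; with the flag, accented French letters match case-insensitively as well (on Python 3 this is already the default for str patterns). A toy version of the compiled matcher, with made-up alternatives instead of the real `_regular_exp` list:

```python
import re

# Same alternation-building idiom as TOKEN_MATCH above, on two dummy words.
token_match = re.compile(
    "|".join("(?:{})".format(m) for m in ["été", "où"]),
    re.IGNORECASE | re.UNICODE,
).match
print(bool(token_match("ÉTÉ")))  # True once Unicode case-folding applies
```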
travis.sh (deleted, 32 lines)
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-if [ "${VIA}" == "pypi" ]; then
-    rm -rf *
-    pip install spacy-nightly
-    python -m spacy download en
-fi
-
-if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
-    rm -rf *
-    pip uninstall spacy
-    wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT
-    mv $TRAVIS_COMMIT sdist.tgz
-    pip install -U sdist.tgz
-fi
-
-
-if [ "${VIA}" == "compile" ]; then
-    pip install -r requirements.txt
-    python setup.py build_ext --inplace
-    pip install -e .
-fi
-
-# mkdir -p corpora/en
-# cd corpora/en
-# wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz
-# tar -xzf WordNet-3.0.tar.gz
-# mv WordNet-3.0 wordnet
-# cd ../../
-# mkdir models/
-# python bin/init_model.py en lang_data/ corpora/ models/en
-#fi
website/docs/api/cli.md (filename inferred from content)
@@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
 extension of the input file.
 
 ```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
 ```
 
-| Argument                                     | Type       | Description                                                |
-| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
-| `input_file`                                 | positional | Input file.                                                |
-| `output_dir`                                 | positional | Output directory for converted JSON file.                  |
-| `converter`, `-c` <Tag variant="new">2</Tag> | option     | Name of converter to use (see below).                      |
-| `--n-sents`, `-n`                            | option     | Number of sentences per document.                          |
-| `--morphology`, `-m`                         | option     | Enable appending morphology to tags.                       |
-| `--help`, `-h`                               | flag       | Show help message and available arguments.                 |
-| **CREATES**                                  | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| Argument                                         | Type       | Description                                                                                        |
+| ------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- |
+| `input_file`                                     | positional | Input file.                                                                                        |
+| `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`.  |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create (see below).                                                                |
+| `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                              |
+| `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                  |
+| `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                               |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                             |
+| `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                         |
+| **CREATES**                                      | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input).                                         |
 
-The following file format converters are available:
+### Output file types {new="2.1"}
 
-| ID                | Description                                                      |
-| ----------------- | ---------------------------------------------------------------- |
-| `auto`            | Automatically pick converter based on file extension (default).  |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format.             |
-| `ner`             | Tab-based named entity recognition format.                       |
-| `iob`             | IOB or IOB2 named entity recognition format.                     |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can also
+> be read in line by line, and you won't have to parse the _entire file_ first.
+> This makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID      | Description                       |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json`  | Regular JSON.                     |
+| `msg`   | Binary MessagePack format.        |
+
+### Converter options
+
+<!-- TODO: document jsonl option – maybe update it? -->
+
+| ID                             | Description                                                      |
+| ------------------------------ | ---------------------------------------------------------------- |
+| `auto`                         | Automatically pick converter based on file extension (default).  |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.             |
+| `ner`                          | Tab-based named entity recognition format.                       |
+| `iob`                          | IOB or IOB2 named entity recognition format.                     |
 
 ## Train {#train}
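The "Which format should I choose?" callout above argues for JSONL precisely because it can be streamed. A small sketch of that pattern; the file path and document shape are invented for illustration:

```python
import json

docs = [{"id": i, "paragraphs": []} for i in range(3)]

# One JSON object per line: each record is written independently.
with open("/tmp/corpus.jsonl", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc) + "\n")

# Reading back line by line: no need to parse the entire file at once.
with open("/tmp/corpus.jsonl") as f:
    streamed = [json.loads(line) for line in f]
assert streamed == docs
```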