Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-10 08:12:24 +03:00)
Update feature/lemmatizer from develop
Commit 78aba46530
appveyor.yml (deleted, 21 lines)

@@ -1,21 +0,0 @@
-environment:
-  matrix:
-    - PYTHON: "C:\\Python35-x64"
-    - PYTHON: "C:\\Python36-x64"
-    - PYTHON: "C:\\Python37-x64"
-install:
-  # We need wheel installed to build wheels
-  - "%PYTHON%\\python.exe -m pip install wheel"
-  - "%PYTHON%\\python.exe -m pip install cython"
-  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
-  - "%PYTHON%\\python.exe -m pip install -e ."
-build: off
-test_script:
-  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
-after_test:
-  - "%PYTHON%\\python.exe setup.py bdist_wheel"
-artifacts:
-  - path: dist\*
-branches:
-  except:
-    - spacy.io
.travis.yml (15 lines changed)

@@ -5,23 +5,16 @@ dist: trusty
 group: edge
 python:
   - "2.7"
-  - "3.5"
-  - "3.6"
 os:
   - linux
-env:
-  - VIA=compile
-  - VIA=flake8
 install:
-  - "./travis.sh"
-  - pip install flake8
+  - "pip install -r requirements.txt"
+  - "python setup.py build_ext --inplace"
+  - "pip install -e ."
 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
   - "pip install pytest pytest-timeout"
-  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
-  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
-  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - "python -m pytest --tb=native spacy"
 branches:
   except:
     - spacy.io
README.md

@@ -14,8 +14,8 @@ released under the MIT license.
 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
+[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
 [](https://travis-ci.org/explosion/spaCy)
-[](https://ci.appveyor.com/project/explosion/spaCy)
 [](https://github.com/explosion/spaCy/releases)
 [](https://pypi.python.org/pypi/spacy)
 [](https://anaconda.org/conda-forge/spacy)
azure-pipelines.yml (new file, 92 lines)

@@ -0,0 +1,92 @@
+trigger:
+  batch: true
+  branches:
+    include:
+      - '*'
+    exclude:
+      - 'spacy.io'
+  paths:
+    exclude:
+      - 'website/*'
+      - '*.md'
+
+jobs:
+
+  # Perform basic checks for the most important errors (syntax etc.). Uses the
+  # config defined in .flake8 and overwrites the selected codes.
+  - job: 'Validate'
+    pool:
+      vmImage: 'ubuntu-16.04'
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: '3.7'
+      - script: |
+          pip install flake8
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+        displayName: 'flake8'
+
+  - job: 'Test'
+    dependsOn: 'Validate'
+    strategy:
+      matrix:
+        # Python 2.7 currently doesn't work because it seems to be a narrow
+        # unicode build, which causes problems with the regular expressions.
+
+        # Python27Linux:
+        #   imageName: 'ubuntu-16.04'
+        #   python.version: '2.7'
+        # Python27Mac:
+        #   imageName: 'macos-10.13'
+        #   python.version: '2.7'
+        Python35Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.5'
+        Python35Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.5'
+        Python35Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.5'
+        Python36Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.6'
+        Python36Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.6'
+        Python36Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.6'
+        Python37Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.7'
+        Python37Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.7'
+        Python37Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.7'
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)
+
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: '$(python.version)'
+          architecture: 'x64'
+
+      # Downgrading pip is necessary to prevent a wheel version incompatibility.
+      # Might be fixed in the future or some other way, so investigate again.
+      - script: |
+          python -m pip install --upgrade pip==18.1
+          pip install -r requirements.txt
+        displayName: 'Install dependencies'
+
+      - script: |
+          python setup.py build_ext --inplace
+          pip install -e .
+        displayName: 'Build and install'
+
+      - script: python -m pytest --tb=native spacy
+        displayName: 'Run tests'
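The "Validate" job can be reproduced locally before pushing. A minimal sketch, assuming flake8 is installed in the current environment and reusing the exact error-code selection from the pipeline above:

```python
import subprocess

# Run the same flake8 check the Validate job runs, against a local checkout.
subprocess.run(
    ["python", "-m", "flake8", "spacy", "--count",
     "--select=E901,E999,F821,F822,F823",
     "--show-source", "--statistics"],
    check=True,  # a non-zero exit (i.e. lint errors) raises CalledProcessError
)
```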
spacy/_ml.py (51 lines changed)

@@ -84,16 +84,54 @@ def _zero_init(model):
 @layerize
 def _preprocess_doc(docs, drop=0.0):
     keys = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
     # The dtype here matches what thinc is expecting -- which differs per
     # platform (by int definition). This should be fixed once the problem
     # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape) + 1.0
+    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
+    keys = numpy.concatenate(keys)
+    vals = numpy.zeros(keys.shape, dtype='f')
     return (keys, vals, lengths), None
+
+
+def with_cpu(ops, model):
+    """Wrap a model that should run on CPU, transferring inputs and outputs
+    as necessary."""
+    model.to_cpu()
+
+    def with_cpu_forward(inputs, drop=0.):
+        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
+        gpu_outputs = _to_device(ops, cpu_outputs)
+
+        def with_cpu_backprop(d_outputs, sgd=None):
+            cpu_d_outputs = _to_cpu(d_outputs)
+            return backprop(cpu_d_outputs, sgd=sgd)
+
+        return gpu_outputs, with_cpu_backprop
+
+    return wrap(with_cpu_forward, model)
+
+
+def _to_cpu(X):
+    if isinstance(X, numpy.ndarray):
+        return X
+    elif isinstance(X, tuple):
+        return tuple([_to_cpu(x) for x in X])
+    elif isinstance(X, list):
+        return [_to_cpu(x) for x in X]
+    elif hasattr(X, 'get'):
+        return X.get()
+    else:
+        return X
+
+
+def _to_device(ops, X):
+    if isinstance(X, tuple):
+        return tuple([_to_device(ops, x) for x in X])
+    elif isinstance(X, list):
+        return [_to_device(ops, x) for x in X]
+    else:
+        return ops.asarray(X)
+
+
 @layerize
 def _preprocess_doc_bigrams(docs, drop=0.0):
     unigrams = [doc.to_array(LOWER) for doc in docs]

@@ -655,7 +693,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
         >> zero_init(Affine(nr_class, width, drop_factor=0.0))
     )
 
-    linear_model = _preprocess_doc >> LinearModel(nr_class)
+    linear_model = (
+        _preprocess_doc
+        >> with_cpu(Model.ops, LinearModel(nr_class))
+    )
     if cfg.get("exclusive_classes"):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
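The transfer helpers are the interesting part of this hunk: `_to_cpu` walks nested tuples and lists, leaves numpy arrays alone, and calls `.get()` on anything that has one, which is how a cupy device array exposes device-to-host copies. A self-contained sketch of that dispatch, using only numpy:

```python
import numpy

def to_cpu(X):
    # Same dispatch as _to_cpu above: recurse through containers, pass numpy
    # arrays through unchanged, and pull device arrays (anything exposing
    # .get(), e.g. a cupy ndarray) back to host memory.
    if isinstance(X, numpy.ndarray):
        return X
    elif isinstance(X, tuple):
        return tuple(to_cpu(x) for x in X)
    elif isinstance(X, list):
        return [to_cpu(x) for x in X]
    elif hasattr(X, "get"):
        return X.get()
    return X

batch = (numpy.ones(3), [numpy.zeros((2, 2))], 42)
moved = to_cpu(batch)
assert isinstance(moved, tuple) and isinstance(moved[1], list)  # structure kept
```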
spacy/cli/convert.py

@@ -23,15 +23,16 @@ CONVERTERS = {
 }
 
 # File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")
 
 
 @plac.annotations(
     input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory for converted file", "positional", None, str),
-    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
-    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
 )

@@ -58,6 +59,13 @@ def convert(
             "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
             exits=1,
         )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            "Can't write .{} data to stdout.".format(file_type),
+            "Please specify an output directory.",
+            exits=1,
+        )
     if not input_path.exists():
         msg.fail("Input file not found", input_path, exits=1)
     if output_dir != "-" and not Path(output_dir).exists():

@@ -78,6 +86,8 @@ def convert(
             srsly.write_json(output_file, data)
         elif file_type == "jsonl":
             srsly.write_jsonl(output_file, data)
+        elif file_type == "msg":
+            srsly.write_msgpack(output_file, data)
         msg.good("Generated output file ({} documents)".format(len(data)), output_file)
     else:
         # Print to stdout
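With this change, converting to the binary format from the command line looks like the sketch below. The file names are hypothetical, and note the new guard: `-t msg` can't be combined with `-` (stdout) as the output directory.

```python
import subprocess

# Convert a (hypothetical) CoNLL-U file into msgpack output, written into an
# existing output directory "corpus/".
subprocess.run(
    ["python", "-m", "spacy", "convert",
     "train.conllu", "corpus/", "--file-type", "msg"],
    check=True,
)
```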
spacy/errors.py

@@ -342,7 +342,7 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
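A quick illustration of how the now-parametrized message renders; the template string is copied from the diff, while the arguments are made up:

```python
E124 = "Cannot read from file: {path}. Supported formats: {formats}"
print(E124.format(path="corpus.txt", formats=("json", "jsonl", "msg")))
# Cannot read from file: corpus.txt. Supported formats: ('json', 'jsonl', 'msg')
```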
spacy/gold.pyx

@@ -153,10 +153,13 @@ class GoldCorpus(object):
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith("json"):
                 gold_tuples = read_json_file(loc)
+            elif loc.parts[-1].endswith("jsonl"):
+                gold_tuples = srsly.read_jsonl(loc)
             elif loc.parts[-1].endswith("msg"):
                 gold_tuples = srsly.read_msgpack(loc)
             else:
-                raise ValueError(Errors.E124.format(path=path2str(loc)))
+                supported = ("json", "jsonl", "msg")
+                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
             for item in gold_tuples:
                 yield item
                 i += len(item[1])
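The `.msg` branch relies on srsly's msgpack helpers, which this diff also uses on the write side in `convert.py`. A minimal round-trip sketch; the payload is a stand-in, not spaCy's actual training-tuple structure:

```python
import pathlib
import tempfile

import srsly

# Stand-in payload (NOT spaCy's real gold-tuple format), lists only, since
# msgpack round-trips tuples back as lists.
data = [["some raw text", [["placeholder", "annotations"]]]]
path = pathlib.Path(tempfile.gettempdir()) / "train.msg"
srsly.write_msgpack(path, data)
assert srsly.read_msgpack(path) == data
```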
@@ -21,7 +21,9 @@ _suffixes = (
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
 )
spacy/lang/fr/tokenizer_exceptions.py

@@ -379,7 +379,7 @@ _regular_exp = [
 _regular_exp += [
     "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
         prefix=p,
         hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
         elision=ELISION,
         al=ALPHA_LOWER,
     )

@@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN)
 
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
+    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
 ).match
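Adding `re.UNICODE` matters on Python 2, where character classes like `\w` are ASCII-only for unicode patterns unless the flag is set; on Python 3 it is already the default for str patterns. A small sketch of the effect:

```python
import re

# On Python 3 this matches with or without re.UNICODE; on Python 2 the flag
# is what lets \w cover accented characters in patterns like TOKEN_MATCH.
token_match = re.compile(r"^\w+$", re.IGNORECASE | re.UNICODE).match
assert token_match(u"élève") is not None  # accented French token matches
```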
travis.sh (deleted, 32 lines)

@@ -1,32 +0,0 @@
-#!/bin/bash
-
-if [ "${VIA}" == "pypi" ]; then
-    rm -rf *
-    pip install spacy-nightly
-    python -m spacy download en
-fi
-
-if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
-    rm -rf *
-    pip uninstall spacy
-    wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT
-    mv $TRAVIS_COMMIT sdist.tgz
-    pip install -U sdist.tgz
-fi
-
-
-if [ "${VIA}" == "compile" ]; then
-    pip install -r requirements.txt
-    python setup.py build_ext --inplace
-    pip install -e .
-fi
-
-# mkdir -p corpora/en
-# cd corpora/en
-# wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz
-# tar -xzf WordNet-3.0.tar.gz
-# mv WordNet-3.0 wordnet
-# cd ../../
-# mkdir models/
-# python bin/init_model.py en lang_data/ corpora/ models/en
-#fi
website/docs/api/cli.md

@@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
 extension of the input file.
 
 ```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
 ```
 
 | Argument | Type | Description |
 | --- | --- | --- |
 | `input_file` | positional | Input file. |
-| `output_dir` | positional | Output directory for converted JSON file. |
+| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
-| `converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
+| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
 | `--n-sents`, `-n` | option | Number of sentences per document. |
 | `--morphology`, `-m` | option | Enable appending morphology to tags. |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
 
-The following file format converters are available:
+### Output file types {new="2.1"}
 
-| ID | Description |
-| ----------------- | --------------------------------------------------------------- |
-| `auto` | Automatically pick converter based on file extension (default). |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. |
-| `ner` | Tab-based named entity recognition format. |
-| `iob` | IOB or IOB2 named entity recognition format. |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can
+> also be read in line by line and you won't have to parse the _entire file_
+> first. This makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID | Description |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json` | Regular JSON. |
+| `msg` | Binary MessagePack format. |
+
+### Converter options
+
+<!-- TODO: document jsonl option – maybe update it? -->
+
+| ID | Description |
+| ------------------------------ | --------------------------------------------------------------- |
+| `auto` | Automatically pick converter based on file extension (default). |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
+| `ner` | Tab-based named entity recognition format. |
+| `iob` | IOB or IOB2 named entity recognition format. |
 
 ## Train {#train}
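The streaming advantage the new docs call out is easy to see in code. A minimal sketch with a hypothetical corpus path:

```python
import json

# JSONL can be consumed one record at a time, without parsing the whole file:
with open("corpus.jsonl", encoding="utf8") as f:
    for line in f:
        doc = json.loads(line)  # one JSON object per line
        # ...process doc...
```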