Mirror of https://github.com/explosion/spaCy.git

commit 5d21d3e8b9: Merge branch 'develop' into pr/5008

.github/contributors/Jan-711.md (new file, 106 lines)

@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jan Jessewitsch      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 16.02.2020           |
+| GitHub username                | Jan-711              |
+| Website (optional)             |                      |

.gitignore (4 additions)

@@ -39,6 +39,7 @@ __pycache__/
 .env*
 .~env/
 .venv
+env3.6/
 venv/
 .dev
 .denv
@@ -111,3 +112,6 @@ Desktop.ini
 
 # Pycharm project files
 *.idea
+
+# IPython
+.ipynb_checkpoints/

.travis.yml (deleted file, 23 lines)

@@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
-   - "2.7"
-os:
-  - linux
-install:
-  - "pip install -r requirements.txt"
-  - "python setup.py build_ext --inplace"
-  - "pip install -e ."
-script:
-  - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "python -m pytest --tb=native spacy"
-branches:
-  except:
-    - spacy.io
-notifications:
-  slack:
-    secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
-  email: false

@@ -280,23 +280,7 @@ except:  # noqa: E722
 
 ### Python conventions
 
-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
-    print("You are using Python 2 on Windows.")
-```
-
+All Python code must be written **compatible with Python 3.6+**.
 Code that interacts with the file-system should accept objects that follow the
 `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
 If the function is user-facing and takes a path as an argument, it should check
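
The context lines kept above describe the file-system convention that survives this cleanup: functions should accept anything that follows the `pathlib.Path` API rather than requiring a `pathlib.Path` instance, and user-facing functions should coerce plain strings themselves. A minimal sketch of that pattern, for illustration only (`read_json` is a hypothetical helper, not spaCy API):

```python
# Minimal sketch of the path-handling convention described in the hunk above.
# `read_json` is an illustrative helper, not part of spaCy.
import json
from pathlib import Path


def read_json(path):
    # Accept plain strings for convenience, but otherwise rely only on the
    # pathlib.Path API (exists(), open()) instead of isinstance(path, Path) checks.
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        raise IOError(f"Can't read file: {path}")
    with path.open("r", encoding="utf8") as file_:
        return json.load(file_)
```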
							
								
								
									

README.md (14 changes)

@@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [Azure Pipelines build status](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[Travis CI build status](https://travis-ci.org/explosion/spaCy)
 [Current release](https://github.com/explosion/spaCy/releases)
 [PyPI version](https://pypi.org/project/spacy/)
 [conda version](https://anaconda.org/conda-forge/spacy)
@@ -98,12 +97,19 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
 [conda]: https://anaconda.org/conda-forge/spacy
 
+> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
+> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
+> providers and other tooling to support it. This means that in order to run
+> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
+> the library and its Cython dependencies locally. If this is causing problems
+> for you, the easiest solution is to **use Python 3.7** in the meantime.
+
 ### pip
 
 Using pip, spaCy releases are available as source packages and binary wheels (as
@@ -262,9 +268,7 @@ and git preinstalled.
 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
 or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.
 
 ## Run tests
 

@@ -35,12 +35,6 @@ jobs:
   dependsOn: 'Validate'
   strategy:
     matrix:
-      Python35Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.5'
-      Python35Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.5'
       Python36Linux:
         imageName: 'ubuntu-16.04'
         python.version: '3.6'

bin/cythonize.py (deleted file, 169 lines)

@@ -1,169 +0,0 @@
-#!/usr/bin/env python
-""" cythonize.py
-
-Cythonize pyx files into C++ files as needed.
-
-Usage: cythonize.py [root]
-
-Checks pyx files to see if they have been changed relative to their
-corresponding C++ files. If they have, then runs cython on these files to
-recreate the C++ files.
-
-Additionally, checks pxd files and setup.py if they have been changed. If
-they have, rebuilds everything.
-
-Change detection based on file hashes stored in JSON format.
-
-For now, this script should be run by developers when changing Cython files
-and the resulting C++ files checked in, so that end-users (and Python-only
-developers) do not get the Cython dependencies.
-
-Based upon:
-
-https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
-https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
-
-Note: this script does not check any of the dependent C++ libraries.
-"""
-from __future__ import print_function
-
-import os
-import sys
-import json
-import hashlib
-import subprocess
-import argparse
-
-
-HASH_FILE = "cythonize.json"
-
-
-def process_pyx(fromfile, tofile, language_level="-2"):
-    print("Processing %s" % fromfile)
-    try:
-        from Cython.Compiler.Version import version as cython_version
-        from distutils.version import LooseVersion
-
-        if LooseVersion(cython_version) < LooseVersion("0.19"):
-            raise Exception("Require Cython >= 0.19")
-
-    except ImportError:
-        pass
-
-    flags = ["--fast-fail", language_level]
-    if tofile.endswith(".cpp"):
-        flags += ["--cplus"]
-
-    try:
-        try:
-            r = subprocess.call(
-                ["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
-            )  # See Issue #791
-            if r != 0:
-                raise Exception("Cython failed")
-        except OSError:
-            # There are ways of installing Cython that don't result in a cython
-            # executable on the path, see gh-2397.
-            r = subprocess.call(
-                [
-                    sys.executable,
-                    "-c",
-                    "import sys; from Cython.Compiler.Main import "
-                    "setuptools_main as main; sys.exit(main())",
-                ]
-                + flags
-                + ["-o", tofile, fromfile]
-            )
-            if r != 0:
-                raise Exception("Cython failed")
-    except OSError:
-        raise OSError("Cython needs to be installed")
-
-
-def preserve_cwd(path, func, *args):
-    orig_cwd = os.getcwd()
-    try:
-        os.chdir(path)
-        func(*args)
-    finally:
-        os.chdir(orig_cwd)
-
-
-def load_hashes(filename):
-    try:
-        return json.load(open(filename))
-    except (ValueError, IOError):
-        return {}
-
-
-def save_hashes(hash_db, filename):
-    with open(filename, "w") as f:
-        f.write(json.dumps(hash_db))
-
-
-def get_hash(path):
-    return hashlib.md5(open(path, "rb").read()).hexdigest()
-
-
-def hash_changed(base, path, db):
-    full_path = os.path.normpath(os.path.join(base, path))
-    return not get_hash(full_path) == db.get(full_path)
-
-
-def hash_add(base, path, db):
-    full_path = os.path.normpath(os.path.join(base, path))
-    db[full_path] = get_hash(full_path)
-
-
-def process(base, filename, db):
-    root, ext = os.path.splitext(filename)
-    if ext in [".pyx", ".cpp"]:
-        if hash_changed(base, filename, db) or not os.path.isfile(
-            os.path.join(base, root + ".cpp")
-        ):
-            preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
-            hash_add(base, root + ".cpp", db)
-            hash_add(base, root + ".pyx", db)
-
-
-def check_changes(root, db):
-    res = False
-    new_db = {}
-
-    setup_filename = "setup.py"
-    hash_add(".", setup_filename, new_db)
-    if hash_changed(".", setup_filename, db):
-        res = True
-
-    for base, _, files in os.walk(root):
-        for filename in files:
-            if filename.endswith(".pxd"):
-                hash_add(base, filename, new_db)
-                if hash_changed(base, filename, db):
-                    res = True
-
-    if res:
-        db.clear()
-        db.update(new_db)
-    return res
-
-
-def run(root):
-    db = load_hashes(HASH_FILE)
-
-    try:
-        check_changes(root, db)
-        for base, _, files in os.walk(root):
-            for filename in files:
-                process(base, filename, db)
-    finally:
-        save_hashes(db, HASH_FILE)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Cythonize pyx files into C++ files as needed"
-    )
-    parser.add_argument("root", help="root directory")
-    args = parser.parse_args()
-    run(args.root)

@@ -13,23 +13,12 @@ import srsly
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
-from spacy.util import compounding, minibatch_by_words
-from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 
-# from spacy.morphology import Fused_begin, Fused_inside
-from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
 
 Fused_begin = None
 Fused_inside = None
 
-import itertools
-import random
-import numpy.random
-
 from . import conll17_ud_eval
 
 from spacy import lang
@@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
     return nlp
 
 
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
     nlp.add_pipe(nlp.create_pipe("parser"))
     return nlp
 

@@ -14,7 +14,7 @@ import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
@@ -53,7 +53,7 @@ def read_data(
     max_doc_length=None,
     limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
     include Doc objects created using nlp.make_doc and then aligned against
     the gold-standard sequences. If oracle_segments=True, include Doc objects
     created from the gold-standard segments. At least one must be True."""
@@ -98,15 +98,16 @@ def read_data(
                 docs.append(doc)
                 golds.append(gold)
                 if limit and len(docs) >= limit:
-                    return docs, golds
+                    return golds_to_gold_data(docs, golds)
 
         if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
         if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)
 
+
 def _parse_morph_string(morph_string):
     if morph_string == '_':
@@ -120,6 +121,7 @@ def _parse_morph_string(morph_string):
         output.append('%s_%s' % (key, value.lower()))
     return set(output)
 
+
 def read_conllu(file_):
     docs = []
     sent = []
@@ -180,16 +182,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 #############################
 
 
-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+    """Get out the training data format used by begin_training, given the
     GoldParse objects."""
-    tuples = []
+    data = []
     for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example(doc=doc)
+        example.add_doc_annotation(cats=gold.cats)
+        token_annotation_dict = gold.orig.to_dict()
+        example.add_token_annotation(**token_annotation_dict)
+        example.goldparse = gold
+        data.append(example)
+    return data
 
 
 ##############
@@ -327,7 +331,6 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-
 ##################
 # Initialization #
 ##################
@@ -348,7 +351,7 @@ def load_nlp(corpus, config, vectors=None):
     return nlp
 
 
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
     nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
     nlp.add_pipe(nlp.create_pipe("morphologizer"))
     nlp.add_pipe(nlp.create_pipe("parser"))
@@ -356,14 +359,15 @@ def initialize_pipeline(nlp, examples, config, device):
         nlp.parser.add_multitask_objective("tag")
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
-    for gold in golds:
+    for ex in examples:
+        gold = ex.gold
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     if torch is not None and device != -1:
         torch.set_default_tensor_type("torch.cuda.FloatTensor")
     optimizer = nlp.begin_training(
-        lambda: golds_to_gold_tuples(docs, golds),
+        lambda: examples,
         device=device,
         subword_features=config.subword_features,
         conv_depth=config.conv_depth,
@@ -491,6 +495,10 @@ def main(
     Token.set_extension("begins_fused", default=False)
     Token.set_extension("inside_fused", default=False)
 
+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
@@ -505,7 +513,7 @@ def main(
     print("Train and evaluate", corpus, "using lang", paths.lang)
     nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
 
-    docs, golds = read_data(
+    examples = read_data(
         nlp,
         paths.train.conllu.open(encoding="utf8"),
         paths.train.text.open(encoding="utf8"),
@@ -513,12 +521,12 @@ def main(
         limit=limit,
     )
 
-    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
+    optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
 
     batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
     beam_prob = compounding(0.2, 0.8, 1.001)
     for i in range(config.nr_epoch):
-        docs, golds = read_data(
+        examples = read_data(
             nlp,
             paths.train.conllu.open(encoding="utf8"),
             paths.train.text.open(encoding="utf8"),
@@ -527,22 +535,19 @@ def main(
             oracle_segments=use_oracle_segments,
             raw_text=not use_oracle_segments,
         )
-        Xs = list(zip(docs, golds))
-        random.shuffle(Xs)
+        random.shuffle(examples)
         if config.batch_by_words:
-            batches = minibatch_by_words(Xs, size=batch_sizes)
+            batches = minibatch_by_words(examples, size=batch_sizes)
        else:
-            batches = minibatch(Xs, size=batch_sizes)
+            batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(ex.doc) for ex in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(ex.doc) for ex in batch))
                 nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                 nlp.update(
-                    batch_docs,
-                    batch_gold,
+                    batch,
                     sgd=optimizer,
                     drop=config.dropout,
                     losses=losses,
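
Taken together, the hunks above switch this training script from parallel `docs`/`golds` lists (and the old "tuples" format) to `Example` objects that are passed straight to `nlp.update`. The condensed outline below only restates calls that already appear in the diff; the `Example` API here is the develop-branch `spacy.gold` interface at the time of this commit, not a released spaCy API, so treat it as a sketch rather than runnable code:

```python
# Condensed outline of the new data flow shown in the hunks above (develop-branch API).
from spacy.gold import Example
from spacy.util import minibatch


def make_examples(docs, golds):
    # Mirrors golds_to_gold_data(): wrap each (Doc, GoldParse) pair in an Example.
    examples = []
    for doc, gold in zip(docs, golds):
        example = Example(doc=doc)
        example.add_doc_annotation(cats=gold.cats)
        example.add_token_annotation(**gold.orig.to_dict())
        example.goldparse = gold
        examples.append(example)
    return examples


# The training loop then batches Example objects directly:
#     for batch in minibatch(examples, size=batch_sizes):
#         nlp.update(batch, sgd=optimizer, drop=config.dropout, losses=losses)
```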

@@ -46,7 +46,7 @@ def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_fre
             " cf. https://spacy.io/usage/models#languages."
         )
 
-    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
+    logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq))
     entity_frequencies = io.read_entity_to_count(entity_freq_path)
     # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
     filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(

@@ -4,12 +4,12 @@ from random import shuffle
 import logging
 import numpy as np
 
-from spacy._ml import zero_init, create_default_optimizer
-from spacy.cli.pretrain import get_cossim_loss
-
-from thinc.v2v import Model
+from thinc.model import Model
 from thinc.api import chain
-from thinc.neural._classes.affine import Affine
+from thinc.loss import CosineDistance
+from thinc.layers import Linear
+
+from spacy.util import create_default_optimizer
 
 logger = logging.getLogger(__name__)
 
@@ -34,6 +34,7 @@ class EntityEncoder:
         self.input_dim = input_dim
         self.desc_width = desc_width
         self.epochs = epochs
+        self.distance = CosineDistance(ignore_zeros=True, normalize=False)
 
     def apply_encoder(self, description_list):
         if self.encoder is None:
@@ -132,21 +133,17 @@ class EntityEncoder:
     def _build_network(self, orig_width, hidden_with):
         with Model.define_operators({">>": chain}):
             # very simple encoder-decoder model
-            self.encoder = Affine(hidden_with, orig_width)
-            self.model = self.encoder >> zero_init(
-                Affine(orig_width, hidden_with, drop_factor=0.0)
-            )
-        self.sgd = create_default_optimizer(self.model.ops)
+            self.encoder = Linear(hidden_with, orig_width)
+            # TODO: removed the zero_init here - is oK?
+            self.model = self.encoder >> Linear(orig_width, hidden_with)
+        self.sgd = create_default_optimizer()
 
     def _update(self, vectors):
+        truths = self.model.ops.asarray(vectors)
         predictions, bp_model = self.model.begin_update(
-            np.asarray(vectors), drop=self.DROP
+            truths, drop=self.DROP
         )
-        loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
+        d_scores, loss = self.distance(predictions, truths)
         bp_model(d_scores, sgd=self.sgd)
         return loss / len(vectors)
 
-    @staticmethod
-    def _get_loss(golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
-        return loss, gradients
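
For context on the import swap above (thinc's `Affine`/`zero_init` replaced by `Linear`, and `get_cossim_loss` by `CosineDistance`), here is a small standalone sketch of the same two-layer encoder-decoder wiring. It assumes the thinc v8 API (`chain` and `Linear` from `thinc.api`); the widths are made-up values, not taken from the diff:

```python
# Minimal sketch only: mirrors the encoder-decoder built in _build_network above,
# assuming the thinc v8 API. Widths are illustrative.
import numpy
from thinc.api import Linear, chain

hidden_width, orig_width = 64, 300
encoder = Linear(hidden_width, orig_width)                # Linear(nO, nI): orig -> hidden
model = chain(encoder, Linear(orig_width, hidden_width))  # decode back to orig_width

X = numpy.zeros((8, orig_width), dtype="f")
model.initialize(X=X)   # allocate parameters, inferring any unset dimensions from data
Y = model.predict(X)    # reconstruction with shape (8, orig_width)
```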

@@ -17,7 +17,13 @@ import plac
 from tqdm import tqdm
 
 from bin.wiki_entity_linking import wikipedia_processor
-from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
+from bin.wiki_entity_linking import (
+    TRAINING_DATA_FILE,
+    KB_MODEL_DIR,
+    KB_FILE,
+    LOG_FORMAT,
+    OUTPUT_MODEL_DIR,
+)
 from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
 from bin.wiki_entity_linking.kb_creator import read_kb
 
@@ -48,10 +54,12 @@ def main(
     l2=1e-6,
     train_articles=None,
     dev_articles=None,
-    labels_discard=None
+    labels_discard=None,
 ):
     if not output_dir:
-        logger.warning("No output dir specified so no results will be written, are you sure about this ?")
+        logger.warning(
+            "No output dir specified so no results will be written, are you sure about this ?"
+        )
 
     logger.info("Creating Entity Linker with Wikipedia and WikiData")
 
@@ -68,7 +76,11 @@ def main(
     # STEP 1 : load the NLP object
     logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
     nlp = spacy.load(nlp_dir)
-    logger.info("Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
+    logger.info(
+        "Original NLP pipeline has following pipeline components: {}".format(
+            nlp.pipe_names
+        )
+    )
 
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
@@ -79,25 +91,42 @@ def main(
 
     # STEP 2: read the training dataset previously created from WP
     logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
-    train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path)
-    logger.info("Training set has {} articles, limit set to roughly {} articles per epoch"
-                .format(len(train_indices), train_articles if train_articles else "all"))
-    logger.info("Dev set has {} articles, limit set to rougly {} articles for evaluation"
-                .format(len(dev_indices), dev_articles if dev_articles else "all"))
+    train_indices, dev_indices = wikipedia_processor.read_training_indices(
+        training_path
+    )
+    logger.info(
+        "Training set has {} articles, limit set to roughly {} articles per epoch".format(
+            len(train_indices), train_articles if train_articles else "all"
+        )
+    )
+    logger.info(
+        "Dev set has {} articles, limit set to rougly {} articles for evaluation".format(
+            len(dev_indices), dev_articles if dev_articles else "all"
+        )
+    )
     if dev_articles:
         dev_indices = dev_indices[0:dev_articles]
 
     # STEP 3: create and train an entity linking pipe
-    logger.info("STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs))
+    logger.info(
+        "STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(
+            epochs
+        )
+    )
     if labels_discard:
         labels_discard = [x.strip() for x in labels_discard.split(",")]
-        logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
+        logger.info(
+            "Discarding {} NER types: {}".format(len(labels_discard), labels_discard)
+        )
     else:
         labels_discard = []
 
     el_pipe = nlp.create_pipe(
-        name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
-                                      "labels_discard": labels_discard}
+        name="entity_linker",
+        config={
+            "pretrained_vectors": nlp.vocab.vectors,
+            "labels_discard": labels_discard,
+        },
     )
     el_pipe.set_kb(kb)
     nlp.add_pipe(el_pipe, last=True)
@@ -109,11 +138,18 @@ def main(
         optimizer.L2 = l2
 
     logger.info("Dev Baseline Accuracies:")
-    dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
-                                                      dev=True, line_ids=dev_indices,
-                                                      kb=kb, labels_discard=labels_discard)
+    dev_data = wikipedia_processor.read_el_docs_golds(
+        nlp=nlp,
+        entity_file_path=training_path,
+        dev=True,
+        line_ids=dev_indices,
+        kb=kb,
+        labels_discard=labels_discard,
+    )
 
-    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices))
+    measure_performance(
+        dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)
+    )
 
     for itn in range(epochs):
         random.shuffle(train_indices)
@@ -127,13 +163,18 @@ def main(
         if train_articles:
             bar_total = train_articles
 
-        with tqdm(total=bar_total, leave=False, desc='Epoch ' + str(itn)) as pbar:
+        with tqdm(total=bar_total, leave=False, desc=f"Epoch {itn}") as pbar:
             for batch in batches:
                 if not train_articles or articles_processed < train_articles:
                     with nlp.disable_pipes("entity_linker"):
-                        train_batch = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
-                                                                             dev=False, line_ids=batch,
-                                                                             kb=kb, labels_discard=labels_discard)
+                        train_batch = wikipedia_processor.read_el_docs_golds(
+                            nlp=nlp,
+                            entity_file_path=training_path,
+                            dev=False,
+                            line_ids=batch,
+                            kb=kb,
+                            labels_discard=labels_discard,
+                        )
                         docs, golds = zip(*train_batch)
                     try:
                         with nlp.disable_pipes(*other_pipes):
 | 
				
			||||||
| 
						 | 
					@ -150,17 +191,36 @@ def main(
 | 
				
			||||||
                    except Exception as e:
 | 
					                    except Exception as e:
 | 
				
			||||||
                        logger.error("Error updating batch:" + str(e))
 | 
					                        logger.error("Error updating batch:" + str(e))
 | 
				
			||||||
        if batchnr > 0:
 | 
					        if batchnr > 0:
 | 
				
			||||||
            logging.info("Epoch {} trained on {} articles, train loss {}"
 | 
					            logging.info(
 | 
				
			||||||
                         .format(itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)))
 | 
					                "Epoch {} trained on {} articles, train loss {}".format(
 | 
				
			||||||
 | 
					                    itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
            # re-read the dev_data (data is returned as a generator)
 | 
					            # re-read the dev_data (data is returned as a generator)
 | 
				
			||||||
            dev_data = wikipedia_processor.read_el_docs_golds(nlp=nlp, entity_file_path=training_path,
 | 
					            dev_data = wikipedia_processor.read_el_docs_golds(
 | 
				
			||||||
                                                              dev=True, line_ids=dev_indices,
 | 
					                nlp=nlp,
 | 
				
			||||||
                                                              kb=kb, labels_discard=labels_discard)
 | 
					                entity_file_path=training_path,
 | 
				
			||||||
            measure_performance(dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices))
 | 
					                dev=True,
 | 
				
			||||||
 | 
					                line_ids=dev_indices,
 | 
				
			||||||
 | 
					                kb=kb,
 | 
				
			||||||
 | 
					                labels_discard=labels_discard,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            measure_performance(
 | 
				
			||||||
 | 
					                dev_data,
 | 
				
			||||||
 | 
					                kb,
 | 
				
			||||||
 | 
					                el_pipe,
 | 
				
			||||||
 | 
					                baseline=False,
 | 
				
			||||||
 | 
					                context=True,
 | 
				
			||||||
 | 
					                dev_limit=len(dev_indices),
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if output_dir:
 | 
					    if output_dir:
 | 
				
			||||||
        # STEP 4: write the NLP pipeline (now including an EL model) to file
 | 
					        # STEP 4: write the NLP pipeline (now including an EL model) to file
 | 
				
			||||||
        logger.info("Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names))
 | 
					        logger.info(
 | 
				
			||||||
 | 
					            "Final NLP pipeline has following pipeline components: {}".format(
 | 
				
			||||||
 | 
					                nlp.pipe_names
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
        logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
 | 
					        logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
 | 
				
			||||||
        nlp.to_disk(nlp_output_dir)
 | 
					        nlp.to_disk(nlp_output_dir)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
@@ -14,7 +14,7 @@ pip install keras==2.0.9

 Compatible with: spaCy v2.0.0+
 """
-
+import ml_datasets
 import plac
 import random
 import pathlib
@@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
 from keras.layers import LSTM, Dense, Embedding, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-import thinc.extra.datasets
 from spacy.compat import pickle
 import spacy

@@ -224,7 +223,7 @@ def main(
     if model_dir is not None:
         model_dir = pathlib.Path(model_dir)
     if train_dir is None or dev_dir is None:
-        imdb_data = thinc.extra.datasets.imdb()
+        imdb_data = ml_datasets.imdb()
     if is_runtime:
         if dev_dir is None:
            dev_texts, dev_labels = zip(*imdb_data[1])
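Note: this hunk (and several below) replaces thinc.extra.datasets with the standalone ml_datasets package. A minimal sketch of the replacement call, assuming ml_datasets is installed (pip install ml_datasets); the print line is illustrative, everything else mirrors the calls in the diff:

    import ml_datasets

    # imdb() returns (train, dev), each a list of (text, label) pairs,
    # which the examples unpack with zip(*data)
    train_data, dev_data = ml_datasets.imdb()
    texts, labels = zip(*train_data)
    print(len(texts), "training texts")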
							
								
								
									
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg (new file, 63 lines)
@@ -0,0 +1,63 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = 0
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.tagger.model]
+@architectures = "tagger_model.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "transition_based_parser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_bilstm.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+embed_size = 2000
							
								
								
									
examples/experiments/ptb-joint-pos-dep/defaults.cfg (new file, 65 lines)
@@ -0,0 +1,65 @@
+[training]
+patience = 10000
+eval_frequency = 200
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+use_gpu = -1
+scores = ["tags_acc", "uas", "las"]
+score_weights = {"las": 0.8, "tags_acc": 0.2}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.tagger.model]
+@architectures = "tagger_model.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "transition_based_parser.v1"
+nr_feature_tokens = 8
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_cnn.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 96
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
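Note: both config files above use thinc's config system (@schedules, @optimizers, @architectures references and ${...} interpolation). A minimal sketch of loading one of them, assuming thinc v8's Config class and a local copy saved as defaults.cfg:

    from thinc.api import Config  # assumes thinc v8 is installed

    # parse the file; dotted section names become nested dicts
    config = Config().from_disk("defaults.cfg")
    print(config["training"]["max_epochs"])                # 100
    print(config["nlp"]["pipeline"]["parser"]["factory"])  # "parser"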
@@ -13,9 +13,10 @@ Prerequisites: pip install joblib
 from __future__ import print_function, unicode_literals

 from pathlib import Path
+
+import ml_datasets
 from joblib import Parallel, delayed
 from functools import partial
-import thinc.extra.datasets
 import plac
 import spacy
 from spacy.util import minibatch
@@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
         output_dir.mkdir()
     # load and pre-process the IMBD dataset
     print("Loading IMDB data...")
-    data, _ = thinc.extra.datasets.imdb()
+    data, _ = ml_datasets.imdb()
     texts, _ = zip(*data[-limit:])
     print("Processing texts...")
     partitions = minibatch(texts, size=batch_size)
@@ -26,12 +26,12 @@ DEFAULT_TEXT = "Mark Zuckerberg is the CEO of Facebook."
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""


-@st.cache(ignore_hash=True)
+@st.cache(allow_output_mutation=True)
 def load_model(name):
     return spacy.load(name)


-@st.cache(ignore_hash=True)
+@st.cache(allow_output_mutation=True)
 def process_text(model_name, text):
     nlp = load_model(model_name)
     return nlp(text)
@@ -79,7 +79,9 @@ if "ner" in nlp.pipe_names:
     st.header("Named Entities")
     st.sidebar.header("Named Entities")
     label_set = nlp.get_pipe("ner").labels
-    labels = st.sidebar.multiselect("Entity labels", label_set, label_set)
+    labels = st.sidebar.multiselect(
+        "Entity labels", options=label_set, default=list(label_set)
+    )
     html = displacy.render(doc, style="ent", options={"ents": labels})
     # Newlines seem to mess with the rendering
     html = html.replace("\n", " ")
@@ -12,7 +12,7 @@ import tqdm
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
 from spacy.syntax.nonproj import projectivize
 from collections import defaultdict
 from spacy.matcher import Matcher
@@ -33,25 +33,25 @@ random.seed(0)
 numpy.random.seed(0)


-def minibatch_by_words(items, size=5000):
-    random.shuffle(items)
+def minibatch_by_words(examples, size=5000):
+    random.shuffle(examples)
     if isinstance(size, int):
         size_ = itertools.repeat(size)
     else:
         size_ = size
-    items = iter(items)
+    examples = iter(examples)
     while True:
         batch_size = next(size_)
         batch = []
         while batch_size >= 0:
             try:
-                doc, gold = next(items)
+                example = next(examples)
             except StopIteration:
                 if batch:
                     yield batch
                 return
-            batch_size -= len(doc)
-            batch.append((doc, gold))
+            batch_size -= len(example.doc)
+            batch.append(example)
         if batch:
             yield batch
         else:
@@ -78,7 +78,7 @@ def read_data(
     max_doc_length=None,
     limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
     include Doc objects created using nlp.make_doc and then aligned against
     the gold-standard sequences. If oracle_segments=True, include Doc objects
     created from the gold-standard segments. At least one must be True."""
@@ -119,15 +119,15 @@ def read_data(
                 docs.append(doc)
                 golds.append(gold)
                 if limit and len(docs) >= limit:
-                    return docs, golds
+                    return golds_to_gold_data(docs, golds)

         if raw_text and sent_annots:
             doc, gold = _make_gold(nlp, None, sent_annots)
             docs.append(doc)
             golds.append(gold)
         if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)


 def read_conllu(file_):
@@ -181,16 +181,18 @@ def _make_gold(nlp, text, sent_annots):
 #############################


-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+    """Get out the training data format used by begin_training, given the
     GoldParse objects."""
-    tuples = []
+    data = []
     for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example(doc=doc)
+        example.add_doc_annotation(cats=gold.cats)
+        token_annotation_dict = gold.orig.to_dict()
+        example.add_token_annotation(**token_annotation_dict)
+        example.goldparse = gold
+        data.append(example)
+    return data


 ##############
@@ -303,7 +305,7 @@ def load_nlp(corpus, config):
     return nlp


-def initialize_pipeline(nlp, docs, golds, config):
+def initialize_pipeline(nlp, examples, config):
     nlp.add_pipe(nlp.create_pipe("parser"))
     if config.multitask_tag:
         nlp.parser.add_multitask_objective("tag")
@@ -311,18 +313,19 @@ def initialize_pipeline(nlp, docs, golds, config):
         nlp.parser.add_multitask_objective("sent_start")
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
-    for gold in golds:
-        for tag in gold.tags:
+    for ex in examples:
+        for tag in ex.gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
     actions = set(nlp.parser.labels)
     label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for gold in golds:
+    for ex in examples:
+        gold = ex.gold
         for i, label in enumerate(gold.labels):
             if label is not None and label not in label_set:
                 gold.labels[i] = label.split("||")[0]
-    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+    return nlp.begin_training(lambda: examples)
@@ -391,13 +394,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     Token.set_extension("begins_fused", default=False)
     Token.set_extension("inside_fused", default=False)

+    Token.set_extension("get_conllu_lines", method=get_token_conllu)
+    Token.set_extension("begins_fused", default=False)
+    Token.set_extension("inside_fused", default=False)
+
     paths = TreebankPaths(ud_dir, corpus)
     if not (parses_dir / corpus).exists():
         (parses_dir / corpus).mkdir()
     print("Train and evaluate", corpus, "using lang", paths.lang)
     nlp = load_nlp(paths.lang, config)

-    docs, golds = read_data(
+    examples = read_data(
         nlp,
         paths.train.conllu.open(encoding="utf8"),
         paths.train.text.open(encoding="utf8"),
@@ -405,23 +412,18 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
         limit=limit,
     )

-    optimizer = initialize_pipeline(nlp, docs, golds, config)
+    optimizer = initialize_pipeline(nlp, examples, config)

     for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(doc.text) for doc in docs]
-        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
+        docs = [nlp.make_doc(example.doc.text) for example in examples]
+        batches = minibatch_by_words(examples, size=config.batch_size)
         losses = {}
         n_train_words = sum(len(doc) for doc in docs)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(ex.doc) for ex in batch))
                 nlp.update(
-                    batch_docs,
-                    batch_gold,
-                    sgd=optimizer,
-                    drop=config.dropout,
-                    losses=losses,
+                    examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
                 )

         out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
@@ -31,14 +31,13 @@ random.seed(0)

 PWD = os.path.dirname(__file__)

-TRAIN_DATA = list(read_json_file(
-    os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
+TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))


-def get_position_label(i, words, tags, heads, labels, ents):
+def get_position_label(i, token_annotation):
     """Return labels indicating the position of the word in the document.
     """
-    if len(words) < 20:
+    if len(token_annotation.words) < 20:
         return "short-doc"
     elif i == 0:
         return "first-word"
@@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
         return "early-word"
     elif i < 20:
         return "mid-word"
-    elif i == len(words) - 1:
+    elif i == len(token_annotation.words) - 1:
         return "last-word"
     else:
         return "late-word"
@@ -60,17 +59,17 @@ def main(n_iter=10):
     print(nlp.pipeline)

     print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annot_brackets in TRAIN_DATA:
-            for annotations, _ in annot_brackets:
-                doc = Doc(nlp.vocab, words=annotations[1])
-                gold = GoldParse.from_annot_tuples(doc, annotations)
+        for example in TRAIN_DATA:
+            for token_annotation in example.token_annotations:
+                doc = Doc(nlp.vocab, words=token_annotation.words)
+                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
+
                 nlp.update(
-                    [doc],  # batch of texts
-                    [gold],  # batch of annotations
+                    examples=[(doc, gold)],  # 1 example
                     drop=0.2,  # dropout - make it harder to memorise data
                     sgd=optimizer,  # callable to update weights
                     losses=losses,
@@ -78,9 +77,9 @@ def main(n_iter=10):
         print(losses.get("nn_labeller", 0.0), losses["ner"])

     # test the trained model
-    for text, _ in TRAIN_DATA:
-        if text is not None:
-            doc = nlp(text)
+    for example in TRAIN_DATA:
+        if example.text is not None:
+            doc = nlp(example.text)
             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
@@ -16,16 +16,18 @@ the development labels, after all --- only the unlabelled text.
 import plac
 import tqdm
 import random
+
+import ml_datasets
+
 import spacy
-import thinc.extra.datasets
 from spacy.util import minibatch, use_gpu, compounding
-from spacy._ml import Tok2Vec
 from spacy.pipeline import TextCategorizer
+from spacy.ml.tok2vec import Tok2Vec
 import numpy


 def load_texts(limit=0):
-    train, dev = thinc.extra.datasets.imdb()
+    train, dev = ml_datasets.imdb()
     train_texts, train_labels = zip(*train)
     dev_texts, dev_labels = zip(*train)
     train_texts = list(train_texts)
@@ -41,7 +43,7 @@ def load_texts(limit=0):
 def load_textcat_data(limit=0):
     """Load data from the IMDB dataset."""
     # Partition off part of the train data for evaluation
-    train_data, eval_data = thinc.extra.datasets.imdb()
+    train_data, eval_data = ml_datasets.imdb()
     random.shuffle(train_data)
     train_data = train_data[-limit:]
     texts, labels = zip(*train_data)
@@ -63,17 +65,15 @@ def prefer_gpu():


 def build_textcat_model(tok2vec, nr_class, width):
-    from thinc.v2v import Model, Softmax, Maxout
-    from thinc.api import flatten_add_lengths, chain
-    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
-    from thinc.misc import Residual, LayerNorm
-    from spacy._ml import logistic, zero_init
+    from thinc.model import Model
+    from thinc.layers import Softmax, chain, reduce_mean
+    from thinc.layers import list2ragged

     with Model.define_operators({">>": chain}):
         model = (
             tok2vec
-            >> flatten_add_lengths
-            >> Pooling(mean_pool)
+            >> list2ragged()
+            >> reduce_mean()
             >> Softmax(nr_class, width)
         )
     model.tok2vec = tok2vec
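Note: the hunk above swaps thinc v7 pieces (flatten_add_lengths, Pooling(mean_pool)) for their thinc v8 counterparts (list2ragged(), reduce_mean()). A minimal standalone sketch of the same chain, assuming thinc v8; the Embed stand-in and dimensions are illustrative, not spaCy's actual tok2vec:

    from thinc.api import Embed, Softmax, chain, list2ragged, reduce_mean, with_array

    nr_class, width, vocab_size = 2, 96, 10000
    # illustrative stand-in: map token ids to vectors, one array per doc
    tok2vec = with_array(Embed(nO=width, nV=vocab_size))
    # list of per-token vectors -> ragged -> mean-pooled doc vector -> class probabilities
    model = chain(tok2vec, list2ragged(), reduce_mean(), Softmax(nr_class, width))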
@@ -81,7 +81,7 @@ def build_textcat_model(tok2vec, nr_class, width):


 def block_gradients(model):
-    from thinc.api import wrap
+    from thinc.api import wrap  # TODO FIX

     def forward(X, drop=0.0):
         Y, _ = model.begin_update(X, drop=drop)
@@ -114,7 +114,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
         losses = {}
         for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
             docs = [nlp.make_doc(text) for text in batch]
-            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
+            tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
         print(losses)
     return optimizer

@@ -143,8 +143,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(tqdm.tqdm(train_data), size=2)
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
@@ -58,7 +58,7 @@ def main(model_name, unlabelled_loc):
     # yet, but I'm getting weird results from Adam. Try commenting out the
     # nlp.update(), and using Adam -- you'll find the models drift apart.
     # I guess Adam is losing precision, introducing gradient noise?
-    optimizer.alpha = 0.1
+    optimizer.learn_rate = 0.1
     optimizer.b1 = 0.0
     optimizer.b2 = 0.0

@@ -75,8 +75,7 @@ def main(model_name, unlabelled_loc):
             # batch up the examples using spaCy's minibatch
             raw_batches = minibatch(raw_docs, size=4)
             for batch in minibatch(TRAIN_DATA, size=sizes):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
                 raw_batch = list(next(raw_batches))
                 nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
             print("Losses", losses)
@@ -17,7 +17,7 @@ import plac
 import random
 from pathlib import Path

-from spacy.symbols import PERSON
+import srsly
 from spacy.vocab import Vocab

 import spacy
@@ -68,7 +68,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     vocab = Vocab().from_disk(vocab_path)
     # create blank Language class with correct vocab
     nlp = spacy.blank("en", vocab=vocab)
-    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
+    nlp.vocab.vectors.name = "nel_vectors"
     print("Created blank 'en' model with vocab from '%s'" % vocab_path)

     # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
@@ -93,7 +93,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         nlp.add_pipe(entity_linker, last=True)

     # Convert the texts to docs to make sure we have doc.ents set for the training examples.
-    # Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
     kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
     TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
@@ -118,16 +118,15 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
     with nlp.disable_pipes(*other_pipes):  # only train entity linker
         # reset and initialize the weights randomly
         optimizer = nlp.begin_training()
+
         for itn in range(n_iter):
             random.shuffle(TRAIN_DOCS)
             losses = {}
             # batch up the examples using spaCy's minibatch
             batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
-                texts, annotations = zip(*batch)
                 nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
+                    batch,
                     drop=0.2,  # dropout - make it harder to memorise data
                     losses=losses,
                     sgd=optimizer,
@@ -134,8 +134,7 @@ def main(model=None, output_dir=None, n_iter=15):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
             print("Losses", losses)

     # test the trained model
@@ -68,10 +68,8 @@ def main(model=None, output_dir=None, n_iter=100):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
-                texts, annotations = zip(*batch)
                 nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
+                    batch,
                     drop=0.5,  # dropout - make it harder to memorise data
                     losses=losses,
                 )
@@ -105,8 +105,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
             batches = minibatch(TRAIN_DATA, size=sizes)
             losses = {}
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
             print("Losses", losses)

     # test the trained model
@@ -75,8 +75,7 @@ def main(model=None, output_dir=None, n_iter=15):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
             print("Losses", losses)

     # test the trained model
@@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25):
         # batch up the examples using spaCy's minibatch
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
         for batch in batches:
-            texts, annotations = zip(*batch)
-            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+            nlp.update(batch, sgd=optimizer, losses=losses)
         print("Losses", losses)
 
     # test the trained model
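These hunks all make the same API change in the example scripts: the minibatch of (text, annotations) tuples is now passed to nlp.update directly, without unpacking it into separate texts and annotations sequences. A minimal sketch of the updated loop, assuming nlp, optimizer and TRAIN_DATA are set up as in the surrounding example scripts:

    # Sketch of the updated training-loop pattern used in these examples.
    # Assumes TRAIN_DATA is the usual list of (text, annotations) tuples and
    # that nlp and optimizer were created as in the surrounding scripts.
    import random
    from spacy.util import minibatch, compounding

    def train_loop(nlp, optimizer, TRAIN_DATA, n_iter=10):
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                # The batch of (text, annotations) pairs goes straight in; no
                # more texts, annotations = zip(*batch) before the update.
                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)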
@@ -10,10 +10,11 @@ see the documentation:
 Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
+import ml_datasets
 import plac
 import random
 from pathlib import Path
-import thinc.extra.datasets
 
 import spacy
 from spacy.util import minibatch, compounding
@@ -83,8 +84,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
             random.shuffle(train_data)
             batches = minibatch(train_data, size=batch_sizes)
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
@@ -117,7 +117,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
 def load_data(limit=0, split=0.8):
     """Load data from the IMDB dataset."""
     # Partition off part of the train data for evaluation
-    train_data, _ = thinc.extra.datasets.imdb()
+    train_data, _ = ml_datasets.imdb()
     random.shuffle(train_data)
     train_data = train_data[-limit:]
     texts, labels = zip(*train_data)
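For reference, a minimal sketch of the swapped-in data loader, mirroring load_data() above. It assumes ml_datasets (added to requirements.txt in this commit) is installed and, as the example relies on, that imdb() returns (train_data, dev_data) with each item a (text, label) pair:

    # Sketch: load IMDB examples via ml_datasets instead of
    # thinc.extra.datasets, as in the updated load_data() above.
    import random
    import ml_datasets

    def load_imdb_texts(limit=1000):
        train_data, _ = ml_datasets.imdb()
        random.shuffle(train_data)
        train_data = train_data[-limit:]
        texts, labels = zip(*train_data)
        return texts, labels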
							
								
								
									
9  fabfile.py  (vendored)

@@ -1,9 +1,6 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
 import contextlib
 from pathlib import Path
-from fabric.api import local, lcd, env, settings, prefix
+from fabric.api import local, lcd
 from os import path, environ
 import shutil
 import sys

@@ -82,9 +79,7 @@ def pex():
     with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
             sha = local("git rev-parse --short HEAD", capture=True)
-            venv_local(
-                "pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
-            )
+            venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
 
 
 def clean():
@@ -1,20 +1,21 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc==7.4.0.dev0
+thinc==8.0.0a0
 blis>=0.4.0,<0.5.0
+ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
-srsly>=1.0.1,<1.1.0
+srsly>=2.0.0,<3.0.0
 catalogue>=0.0.7,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
+pydantic>=1.0.0,<2.0.0
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
16  setup.cfg
@@ -16,10 +16,7 @@ classifiers =
     Operating System :: MacOS :: MacOS X
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.5
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
@@ -30,32 +27,35 @@ zip_safe = false
 include_package_data = true
 scripts =
     bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=3.6
 setup_requires =
     wheel
     cython>=0.25
+    numpy>=1.15.0
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc==7.4.0.dev0
+    thinc==8.0.0a0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc==7.4.0.dev0
+    thinc==8.0.0a0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
-    srsly>=1.0.1,<1.1.0
+    srsly>=2.0.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
+    ml_datasets
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     setuptools
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    pathlib==1.0.1; python_version < "3.4"
+    pydantic>=1.3.0,<2.0.0
+    tqdm>=4.38.0,<5.0.0
 
 [options.extras_require]
 lookups =
133  setup.py
@@ -1,37 +1,23 @@
 #!/usr/bin/env python
-from __future__ import print_function
-import io
-import os
-import subprocess
 import sys
-import contextlib
 from distutils.command.build_ext import build_ext
 from distutils.sysconfig import get_python_inc
 import distutils.util
 from distutils import ccompiler, msvccompiler
 from setuptools import Extension, setup, find_packages
+import numpy
+from pathlib import Path
+from Cython.Build import cythonize
+from Cython.Compiler import Options
 
 
-def is_new_osx():
-    """Check whether we're on OSX >= 10.10"""
-    name = distutils.util.get_platform()
-    if sys.platform != "darwin":
-        return False
-    elif name.startswith("macosx-10"):
-        minor_version = int(name.split("-")[1].split(".")[1])
-        if minor_version >= 7:
-            return True
-        else:
-            return False
-    else:
-        return False
+# Preserve `__doc__` on functions and classes
+# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
+Options.docstrings = True
 
 
 PACKAGES = find_packages()
-
-
 MOD_NAMES = [
-    "spacy._align",
     "spacy.parts_of_speech",
     "spacy.strings",
     "spacy.lexeme",
@@ -63,16 +49,32 @@ MOD_NAMES = [
     "spacy.symbols",
     "spacy.vectors",
 ]
-
-
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
     "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
     "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
 }
-
-
 LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
+COMPILER_DIRECTIVES = {
+    "language_level": -3,
+    "embedsignature": True,
+    "annotation_typing": False,
+}
+
+
+def is_new_osx():
+    """Check whether we're on OSX >= 10.10"""
+    name = distutils.util.get_platform()
+    if sys.platform != "darwin":
+        return False
+    elif name.startswith("macosx-10"):
+        minor_version = int(name.split("-")[1].split(".")[1])
+        if minor_version >= 7:
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
 if is_new_osx():
@@ -105,87 +107,40 @@ class build_ext_subclass(build_ext, build_ext_options):
         build_ext.build_extensions(self)
 
 
-def generate_cython(root, source):
-    print("Cythonizing sources")
-    p = subprocess.call(
-        [sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
-        env=os.environ,
-    )
-    if p != 0:
-        raise RuntimeError("Running cythonize failed")
-
-
-def is_source_release(path):
-    return os.path.exists(os.path.join(path, "PKG-INFO"))
-
-
 def clean(path):
-    for name in MOD_NAMES:
-        name = name.replace(".", "/")
-        for ext in [".so", ".html", ".cpp", ".c"]:
-            file_path = os.path.join(path, name + ext)
-            if os.path.exists(file_path):
-                os.unlink(file_path)
-
-
-@contextlib.contextmanager
-def chdir(new_dir):
-    old_dir = os.getcwd()
-    try:
-        os.chdir(new_dir)
-        sys.path.insert(0, new_dir)
-        yield
-    finally:
-        del sys.path[0]
-        os.chdir(old_dir)
+    for path in path.glob("**/*"):
+        if path.is_file() and path.suffix in (".so", ".cpp"):
+            print(f"Deleting {path.name}")
+            path.unlink()
 
 
 def setup_package():
-    root = os.path.abspath(os.path.dirname(__file__))
+    root = Path(__file__).parent
 
     if len(sys.argv) > 1 and sys.argv[1] == "clean":
-        return clean(root)
+        return clean(root / "spacy")
 
-    with chdir(root):
-        with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
+    with (root / "spacy" / "about.py").open("r") as f:
         about = {}
         exec(f.read(), about)
 
     include_dirs = [
         get_python_inc(plat_specific=True),
-            os.path.join(root, "include"),
+        numpy.get_include(),
+        str(root / "include"),
     ]
-
     if (
         ccompiler.new_compiler().compiler_type == "msvc"
         and msvccompiler.get_build_version() == 9
     ):
-            include_dirs.append(os.path.join(root, "include", "msvc9"))
-
+        include_dirs.append(str(root / "include" / "msvc9"))
     ext_modules = []
-        for mod_name in MOD_NAMES:
-            mod_path = mod_name.replace(".", "/") + ".cpp"
-            extra_link_args = []
-            # ???
-            # Imported from patch from @mikepb
-            # See Issue #267. Running blind here...
-            if sys.platform == "darwin":
-                dylib_path = [".." for _ in range(mod_name.count("."))]
-                dylib_path = "/".join(dylib_path)
-                dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
-                extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
-            ext_modules.append(
-                Extension(
-                    mod_name,
-                    [mod_path],
-                    language="c++",
-                    include_dirs=include_dirs,
-                    extra_link_args=extra_link_args,
-                )
-            )
-
-        if not is_source_release(root):
-            generate_cython(root, "spacy")
+    for name in MOD_NAMES:
+        mod_path = name.replace(".", "/") + ".pyx"
+        ext = Extension(name, [mod_path], language="c++")
+        ext_modules.append(ext)
+    print("Cythonizing sources")
+    ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
 
     setup(
         name="spacy",
@@ -193,6 +148,8 @@ def setup_package():
         version=about["__version__"],
         ext_modules=ext_modules,
         cmdclass={"build_ext": build_ext_subclass},
+        include_dirs=include_dirs,
+        package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
     )
 
 
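The rewritten setup.py drops the old bin/cythonize.py subprocess and passes Extension objects straight to cythonize() with the COMPILER_DIRECTIVES defined above. A minimal self-contained sketch of that build pattern, using a hypothetical pkg/fast.pyx module for illustration:

    # Sketch: build a Cython extension the way the new setup.py does, by
    # calling cythonize() directly with compiler directives instead of
    # shelling out to a separate cythonize script. "pkg.fast" / "pkg/fast.pyx"
    # are hypothetical names.
    import numpy
    from setuptools import Extension, setup
    from Cython.Build import cythonize

    COMPILER_DIRECTIVES = {
        "language_level": -3,
        "embedsignature": True,   # keep signatures so __doc__ survives
        "annotation_typing": False,
    }

    ext = Extension("pkg.fast", ["pkg/fast.pyx"], language="c++")
    setup(
        name="pkg",
        ext_modules=cythonize([ext], compiler_directives=COMPILER_DIRECTIVES),
        include_dirs=[numpy.get_include()],
    )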
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 import warnings
 import sys
 
@@ -7,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 
 # These are imported as part of the API
-from thinc.neural.util import prefer_gpu, require_gpu
+from thinc.api import prefer_gpu, require_gpu
 
 from . import pipeline
 from .cli.info import info as cli_info
@@ -23,6 +21,9 @@ if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)
 
 
+config = registry
+
+
 def load(name, **overrides):
     depr_path = overrides.get("path")
     if depr_path not in (True, False, None):
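Since prefer_gpu and require_gpu now come from thinc.api (and are re-exported by spaCy), a small usage sketch; the model name is only an example:

    # Sketch: activate the GPU if one is available before loading a pipeline.
    # "en_core_web_sm" is a hypothetical/example model name here.
    import spacy
    from thinc.api import prefer_gpu

    is_using_gpu = prefer_gpu()   # True if a GPU was activated, else False
    print("GPU" if is_using_gpu else "CPU")
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is a sentence.")
    print([t.text for t in doc])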
@@ -1,21 +1,17 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
 if __name__ == "__main__":
     import plac
     import sys
     from wasabi import msg
     from spacy.cli import download, link, info, package, train, pretrain, convert
     from spacy.cli import init_model, profile, evaluate, validate, debug_data
+    from spacy.cli import train_from_config_cli
 
     commands = {
         "download": download,
         "link": link,
         "info": info,
         "train": train,
+        "train-from-config": train_from_config_cli,
         "pretrain": pretrain,
         "debug-data": debug_data,
         "evaluate": evaluate,
@@ -28,9 +24,9 @@ if __name__ == "__main__":
     if len(sys.argv) == 1:
         msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = "spacy %s" % command
+    sys.argv[0] = f"spacy {command}"
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        available = "Available: {}".format(", ".join(commands))
-        msg.fail("Unknown command: {}".format(command), available, exits=1)
+        available = f"Available: {', '.join(commands)}"
+        msg.fail(f"Unknown command: {command}", available, exits=1)
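The __main__.py changes keep the same plac-based dispatch, only rewritten with f-strings. A toy sketch of that dispatch pattern, with hypothetical commands rather than the real spaCy CLI:

    # Sketch: the command-table dispatch used in spacy/__main__.py, reduced
    # to a toy example. "hello" and "add" are hypothetical commands.
    import sys
    import plac

    def hello(name: ("name to greet", "positional")):
        print(f"Hello, {name}!")

    def add(a: ("first number", "positional"), b: ("second number", "positional")):
        print(int(a) + int(b))

    commands = {"hello": hello, "add": add}

    if __name__ == "__main__":
        if len(sys.argv) == 1:
            sys.exit(f"Available: {', '.join(commands)}")
        command = sys.argv.pop(1)
        sys.argv[0] = f"demo {command}"
        if command in commands:
            plac.call(commands[command], sys.argv[1:])
        else:
            sys.exit(f"Unknown command: {command}")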
							
								
								
									
255  spacy/_align.pyx
@@ -1,255 +0,0 @@
-# cython: infer_types=True
-'''Do Levenshtein alignment, for evaluation of tokenized input.
-
-Random notes:
-
-  r i n g
-  0 1 2 3 4
-r 1 0 1 2 3
-a 2 1 1 2 3
-n 3 2 2 1 2
-g 4 3 3 2 1
-
-0,0: (1,1)=min(0+0,1+1,1+1)=0 S
-1,0: (2,1)=min(1+1,0+1,2+1)=1 D
-2,0: (3,1)=min(2+1,3+1,1+1)=2 D
-3,0: (4,1)=min(3+1,4+1,2+1)=3 D
-0,1: (1,2)=min(1+1,2+1,0+1)=1 D
-1,1: (2,2)=min(0+1,1+1,1+1)=1 S
-2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
-3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
-0,2: (1,3)=min(2+1,3+1,1+1)=2 I
-1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
-2,2: (3,3)
-3,2: (4,3)
-At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
-
-We know the costs to transition:
-
-S[:i]   -> T[:j]   (at D[i,j])
-S[:i+1] -> T[:j]   (at D[i+1,j])
-S[:i]   -> T[:j+1] (at D[i,j+1])
-
-Further, now we can transform:
-S[:i+1] -> S[:i] (DEL) for 1,
-T[:j+1] -> T[:j] (INS) for 1.
-S[i+1]  -> T[j+1] (SUB) for 0 or 1
-
-Therefore we have the costs:
-SUB: Cost(S[:i]->T[:j])   + Cost(S[i]->S[j])
-i.e. D[i, j] + S[i+1] != T[j+1]
-INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
-i.e. D[i+1,j] + 1
-DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
-i.e. D[i,j+1] + 1
-
-    Source string S has length m, with index i
-    Target string T has length n, with index j
-
-    Output two alignment vectors: i2j (length m) and j2i (length n)
-    # function LevenshteinDistance(char s[1..m], char t[1..n]):
-    # for all i and j, d[i,j] will hold the Levenshtein distance between
-    # the first i characters of s and the first j characters of t
-    # note that d has (m+1)*(n+1) values
-    # set each element in d to zero
-    ring rang
-      - r i n g
-    - 0 0 0 0 0
-    r 0 0 0 0 0
-    a 0 0 0 0 0
-    n 0 0 0 0 0
-    g 0 0 0 0 0
-
-    # source prefixes can be transformed into empty string by
-    # dropping all characters
-    # d[i, 0] := i
-    ring rang
-      - r i n g
-    - 0 0 0 0 0
-    r 1 0 0 0 0
-    a 2 0 0 0 0
-    n 3 0 0 0 0
-    g 4 0 0 0 0
-
-    # target prefixes can be reached from empty source prefix
-    # by inserting every character
-    # d[0, j] := j
-      - r i n g
-    - 0 1 2 3 4
-    r 1 0 0 0 0
-    a 2 0 0 0 0
-    n 3 0 0 0 0
-    g 4 0 0 0 0
-
-'''
-from __future__ import unicode_literals
-from libc.stdint cimport uint32_t
-import numpy
-cimport numpy as np
-from .compat import unicode_
-from murmurhash.mrmr cimport hash32
-
-
-def align(S, T):
-    cdef int m = len(S)
-    cdef int n = len(T)
-    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
-    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
-    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
-
-    cdef np.ndarray S_arr = _convert_sequence(S)
-    cdef np.ndarray T_arr = _convert_sequence(T)
-
-    fill_matrix(<int*>matrix.data,
-        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
-    fill_i2j(i2j, matrix)
-    fill_j2i(j2i, matrix)
-    for i in range(i2j.shape[0]):
-        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
-            i2j[i] = -1
-    for j in range(j2i.shape[0]):
-        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
-            j2i[j] = -1
-    return matrix[-1,-1], i2j, j2i, matrix
-
-
-def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
-    '''Let's say we had:
-
-    Guess: [aa bb cc dd]
-    Truth: [aa bbcc dd]
-    i2j: [0, None, -2, 2]
-    j2i: [0, -2, 3]
-
-    We want:
-
-    i2j_multi: {1: 1, 2: 1}
-    j2i_multi: {}
-    '''
-    i2j_miss = _get_regions(i2j, i_lengths)
-    j2i_miss = _get_regions(j2i, j_lengths)
-
-    i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
-    return i2j_multi, j2i_multi
-
-
-def _get_regions(alignment, lengths):
-    regions = {}
-    start = None
-    offset = 0
-    for i in range(len(alignment)):
-        if alignment[i] < 0:
-            if start is None:
-                start = offset
-                regions.setdefault(start, [])
-            regions[start].append(i)
-        else:
-            start = None
-        offset += lengths[i]
-    return regions
-
-
-def _get_mapping(miss1, miss2, lengths1, lengths2):
-    i2j = {}
-    j2i = {}
-    for start, region1 in miss1.items():
-        if not region1 or start not in miss2:
-            continue
-        region2 = miss2[start]
-        if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
-            j = region2.pop(0)
-            buff = []
-            # Consume tokens from region 1, until we meet the length of the
-            # first token in region2. If we do, align the tokens. If
-            # we exceed the length, break.
-            while region1:
-                buff.append(region1.pop(0))
-                if sum(lengths1[i] for i in buff) == lengths2[j]:
-                    for i in buff:
-                        i2j[i] = j
-                    j2i[j] = buff[-1]
-                    j += 1
-                    buff = []
-                elif sum(lengths1[i] for i in buff) > lengths2[j]:
-                    break
-            else:
-                if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
-                    for i in buff:
-                        i2j[i] = j
-                    j2i[j] = buff[-1]
-    return i2j, j2i
-
-
-def _convert_sequence(seq):
-    if isinstance(seq, numpy.ndarray):
-        return numpy.ascontiguousarray(seq, dtype='uint32_t')
-    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
-    cdef bytes item_bytes
-    for i, item in enumerate(seq):
-        if item == "``":
-            item = '"'
-        elif item == "''":
-            item = '"'
-        if isinstance(item, unicode):
-            item_bytes = item.encode('utf8')
-        else:
-            item_bytes = item
-        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
-    return output
-
-
-cdef void fill_matrix(int* D,
-        const int* S, int m, const int* T, int n) nogil:
-    m1 = m+1
-    n1 = n+1
-    for i in range(m1*n1):
-        D[i] = 0
-
-    for i in range(m1):
-        D[i*n1] = i
-
-    for j in range(n1):
-        D[j] = j
-
-    cdef int sub_cost, ins_cost, del_cost
-    for j in range(n):
-        for i in range(m):
-            i_j = i*n1 + j
-            i1_j1 = (i+1)*n1 + j+1
-            i1_j = (i+1)*n1 + j
-            i_j1 = i*n1 + j+1
-            if S[i] != T[j]:
-                sub_cost = D[i_j] + 1
-            else:
-                sub_cost = D[i_j]
-            del_cost = D[i_j1] + 1
-            ins_cost = D[i1_j] + 1
-            best = min(min(sub_cost, ins_cost), del_cost)
-            D[i1_j1] = best
-
-
-cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
-    j = D.shape[1]-2
-    cdef int i = D.shape[0]-2
-    while i >= 0:
-        while D[i+1, j] < D[i+1, j+1]:
-            j -= 1
-        if D[i, j+1] < D[i+1, j+1]:
-            i2j[i] = -1
-        else:
-            i2j[i] = j
-            j -= 1
-        i -= 1
-
-cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
-    i = D.shape[0]-2
-    cdef int j = D.shape[1]-2
-    while j >= 0:
-        while D[i, j+1] < D[i+1, j+1]:
-            i -= 1
-        if D[i+1, j] < D[i+1, j+1]:
-            j2i[j] = -1
-        else:
-            j2i[j] = i
-            i -= 1
-        j -= 1
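The deleted _align.pyx documents the Levenshtein recurrence it implements (SUB from D[i,j], INS from D[i+1,j], DEL from D[i,j+1]). A small pure-Python sketch of that fill, for illustration only; it is not the replacement alignment code spaCy ships:

    # Sketch: the Levenshtein DP fill described in the docstring above.
    # D[i+1][j+1] answers "how do I transform S[:i+1] into T[:j+1]?".
    def levenshtein_matrix(S, T):
        m, n = len(S), len(T)
        D = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(m + 1):   # source prefix -> empty target: i deletions
            D[i][0] = i
        for j in range(n + 1):   # empty source -> target prefix: j insertions
            D[0][j] = j
        for i in range(m):
            for j in range(n):
                sub = D[i][j] + (S[i] != T[j])   # SUB: D[i,j] + (S[i] != T[j])
                ins = D[i + 1][j] + 1            # INS: D[i+1,j] + 1
                dele = D[i][j + 1] + 1           # DEL: D[i,j+1] + 1
                D[i + 1][j + 1] = min(sub, ins, dele)
        return D

    # The "rang" vs "ring" example from the docstring gives distance 1.
    assert levenshtein_matrix("rang", "ring")[-1][-1] == 1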
							
								
								
									
985  spacy/_ml.py
					@ -1,985 +0,0 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import numpy
 | 
					 | 
				
			||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
 | 
					 | 
				
			||||||
from thinc.t2t import ExtractWindow, ParametricAttention
 | 
					 | 
				
			||||||
from thinc.t2v import Pooling, sum_pool, mean_pool
 | 
					 | 
				
			||||||
from thinc.i2v import HashEmbed
 | 
					 | 
				
			||||||
from thinc.misc import Residual, FeatureExtracter
 | 
					 | 
				
			||||||
from thinc.misc import LayerNorm as LN
 | 
					 | 
				
			||||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 | 
					 | 
				
			||||||
from thinc.api import with_getitem, flatten_add_lengths
 | 
					 | 
				
			||||||
from thinc.api import uniqued, wrap, noop
 | 
					 | 
				
			||||||
from thinc.linear.linear import LinearModel
 | 
					 | 
				
			||||||
from thinc.neural.ops import NumpyOps, CupyOps
 | 
					 | 
				
			||||||
from thinc.neural.util import get_array_module, copy_array
 | 
					 | 
				
			||||||
from thinc.neural.optimizers import Adam
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from thinc import describe
 | 
					 | 
				
			||||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
 | 
					 | 
				
			||||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
 | 
					 | 
				
			||||||
import thinc.extra.load_nlp
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
 | 
					 | 
				
			||||||
from .errors import Errors, user_warning, Warnings
 | 
					 | 
				
			||||||
from . import util
 | 
					 | 
				
			||||||
from . import ml as new_ml
 | 
					 | 
				
			||||||
from .ml import _legacy_tok2vec
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
VECTORS_KEY = "spacy_pretrained_vectors"
 | 
					 | 
				
			||||||
# Backwards compatibility with <2.2.2
 | 
					 | 
				
			||||||
USE_MODEL_REGISTRY_TOK2VEC = False
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def cosine(vec1, vec2):
 | 
					 | 
				
			||||||
    xp = get_array_module(vec1)
 | 
					 | 
				
			||||||
    norm1 = xp.linalg.norm(vec1)
 | 
					 | 
				
			||||||
    norm2 = xp.linalg.norm(vec2)
 | 
					 | 
				
			||||||
    if norm1 == 0.0 or norm2 == 0.0:
 | 
					 | 
				
			||||||
        return 0
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        return vec1.dot(vec2) / (norm1 * norm2)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def create_default_optimizer(ops, **cfg):
 | 
					 | 
				
			||||||
    learn_rate = util.env_opt("learn_rate", 0.001)
 | 
					 | 
				
			||||||
    beta1 = util.env_opt("optimizer_B1", 0.9)
 | 
					 | 
				
			||||||
    beta2 = util.env_opt("optimizer_B2", 0.999)
 | 
					 | 
				
			||||||
    eps = util.env_opt("optimizer_eps", 1e-8)
 | 
					 | 
				
			||||||
    L2 = util.env_opt("L2_penalty", 1e-6)
 | 
					 | 
				
			||||||
    max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
 | 
					 | 
				
			||||||
    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
 | 
					 | 
				
			||||||
    optimizer.max_grad_norm = max_grad_norm
 | 
					 | 
				
			||||||
    optimizer.device = ops.device
 | 
					 | 
				
			||||||
    return optimizer
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@layerize
 | 
					 | 
				
			||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
 | 
					 | 
				
			||||||
    ops = Model.ops
 | 
					 | 
				
			||||||
    lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def finish_update(d_X, sgd=None):
 | 
					 | 
				
			||||||
        return ops.unflatten(d_X, lengths, pad=pad)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    X = ops.flatten(seqs, pad=pad)
 | 
					 | 
				
			||||||
    return (X, lengths), finish_update
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _zero_init(model):
 | 
					 | 
				
			||||||
    def _zero_init_impl(self, *args, **kwargs):
 | 
					 | 
				
			||||||
        self.W.fill(0)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    model.on_init_hooks.append(_zero_init_impl)
 | 
					 | 
				
			||||||
    if model.W is not None:
 | 
					 | 
				
			||||||
        model.W.fill(0.0)
 | 
					 | 
				
			||||||
    return model
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def with_cpu(ops, model):
 | 
					 | 
				
			||||||
    """Wrap a model that should run on CPU, transferring inputs and outputs
 | 
					 | 
				
			||||||
    as necessary."""
 | 
					 | 
				
			||||||
    model.to_cpu()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def with_cpu_forward(inputs, drop=0.0):
 | 
					 | 
				
			||||||
        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
 | 
					 | 
				
			||||||
        gpu_outputs = _to_device(ops, cpu_outputs)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        def with_cpu_backprop(d_outputs, sgd=None):
 | 
					 | 
				
			||||||
            cpu_d_outputs = _to_cpu(d_outputs)
 | 
					 | 
				
			||||||
            return backprop(cpu_d_outputs, sgd=sgd)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        return gpu_outputs, with_cpu_backprop
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return wrap(with_cpu_forward, model)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _to_cpu(X):
 | 
					 | 
				
			||||||
    if isinstance(X, numpy.ndarray):
 | 
					 | 
				
			||||||
        return X
 | 
					 | 
				
			||||||
    elif isinstance(X, tuple):
 | 
					 | 
				
			||||||
        return tuple([_to_cpu(x) for x in X])
 | 
					 | 
				
			||||||
    elif isinstance(X, list):
 | 
					 | 
				
			||||||
        return [_to_cpu(x) for x in X]
 | 
					 | 
				
			||||||
    elif hasattr(X, "get"):
 | 
					 | 
				
			||||||
        return X.get()
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        return X
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def _to_device(ops, X):
 | 
					 | 
				
			||||||
    if isinstance(X, tuple):
 | 
					 | 
				
			||||||
        return tuple([_to_device(ops, x) for x in X])
 | 
					 | 
				
			||||||
    elif isinstance(X, list):
 | 
					 | 
				
			||||||
        return [_to_device(ops, x) for x in X]
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        return ops.asarray(X)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class extract_ngrams(Model):
 | 
					 | 
				
			||||||
    def __init__(self, ngram_size, attr=LOWER):
 | 
					 | 
				
			||||||
        Model.__init__(self)
 | 
					 | 
				
			||||||
        self.ngram_size = ngram_size
 | 
					 | 
				
			||||||
        self.attr = attr
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def begin_update(self, docs, drop=0.0):
 | 
					 | 
				
			||||||
        batch_keys = []
 | 
					 | 
				
			||||||
        batch_vals = []
 | 
					 | 
				
			||||||
        for doc in docs:
 | 
					 | 
				
			||||||
            unigrams = doc.to_array([self.attr])
 | 
					 | 
				
			||||||
            ngrams = [unigrams]
 | 
					 | 
				
			||||||
            for n in range(2, self.ngram_size + 1):
 | 
					 | 
				
			||||||
                ngrams.append(self.ops.ngrams(n, unigrams))
 | 
					 | 
				
			||||||
            keys = self.ops.xp.concatenate(ngrams)
 | 
					 | 
				
			||||||
            keys, vals = self.ops.xp.unique(keys, return_counts=True)
 | 
					 | 
				
			||||||
            batch_keys.append(keys)
 | 
					 | 
				
			||||||
            batch_vals.append(vals)
 | 
					 | 
				
			||||||
        # The dtype here matches what thinc is expecting -- which differs per
 | 
					 | 
				
			||||||
        # platform (by int definition). This should be fixed once the problem
 | 
					 | 
				
			||||||
        # is fixed on Thinc's side.
 | 
					 | 
				
			||||||
        lengths = self.ops.asarray(
 | 
					 | 
				
			||||||
            [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        batch_keys = self.ops.xp.concatenate(batch_keys)
 | 
					 | 
				
			||||||
        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
 | 
					 | 
				
			||||||
        return (batch_keys, batch_vals, lengths), None
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@describe.on_data(
 | 
					 | 
				
			||||||
    _set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
@describe.attributes(
 | 
					 | 
				
			||||||
    nI=Dimension("Input size"),
 | 
					 | 
				
			||||||
    nF=Dimension("Number of features"),
 | 
					 | 
				
			||||||
    nO=Dimension("Output size"),
 | 
					 | 
				
			||||||
    nP=Dimension("Maxout pieces"),
 | 
					 | 
				
			||||||
    W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
 | 
					 | 
				
			||||||
    b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
 | 
					 | 
				
			||||||
    pad=Synapses(
 | 
					 | 
				
			||||||
        "Pad",
 | 
					 | 
				
			||||||
        lambda obj: (1, obj.nF, obj.nO, obj.nP),
 | 
					 | 
				
			||||||
        lambda M, ops: ops.normal_init(M, 1.0),
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    d_W=Gradient("W"),
 | 
					 | 
				
			||||||
    d_pad=Gradient("pad"),
 | 
					 | 
				
			||||||
    d_b=Gradient("b"),
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
class PrecomputableAffine(Model):
 | 
					 | 
				
			||||||
    def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
 | 
					 | 
				
			||||||
        Model.__init__(self, **kwargs)
 | 
					 | 
				
			||||||
        self.nO = nO
 | 
					 | 
				
			||||||
        self.nP = nP
 | 
					 | 
				
			||||||
        self.nI = nI
 | 
					 | 
				
			||||||
        self.nF = nF
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def begin_update(self, X, drop=0.0):
 | 
					 | 
				
			||||||
        Yf = self.ops.gemm(
 | 
					 | 
				
			||||||
            X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
 | 
					 | 
				
			||||||
        )
        Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
        Yf = self._add_padding(Yf)

        def backward(dY_ids, sgd=None):
            dY, ids = dY_ids
            dY, ids = self._backprop_padding(dY, ids)
            Xf = X[ids]
            Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))

            self.d_b += dY.sum(axis=0)
            dY = dY.reshape((dY.shape[0], self.nO * self.nP))

            Wopfi = self.W.transpose((1, 2, 0, 3))
            Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
            Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
            dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)

            # Reuse the buffer
            dWopfi = Wopfi
            dWopfi.fill(0.0)
            self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
            dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
            # (o, p, f, i) --> (f, o, p, i)
            self.d_W += dWopfi.transpose((2, 0, 1, 3))

            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return dXf.reshape((dXf.shape[0], self.nF, self.nI))

        return Yf, backward

    def _add_padding(self, Yf):
        Yf_padded = self.ops.xp.vstack((self.pad, Yf))
        return Yf_padded

    def _backprop_padding(self, dY, ids):
        # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
        mask = ids < 0.0
        mask = mask.sum(axis=1)
        d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
        self.d_pad += d_pad.sum(axis=0)
        return dY, ids

    @staticmethod
    def init_weights(model):
        """This is like the 'layer sequential unit variance', but instead
        of taking the actual inputs, we randomly generate whitened data.

        Why's this all so complicated? We have a huge number of inputs,
        and the maxout unit makes guessing the dynamics tricky. Instead
        we set the maxout weights to values that empirically result in
        whitened outputs given whitened inputs.
        """
        if (model.W ** 2).sum() != 0.0:
            return
        ops = model.ops
        xp = ops.xp
        ops.normal_init(model.W, model.nF * model.nI, inplace=True)

        ids = ops.allocate((5000, model.nF), dtype="f")
        ids += xp.random.uniform(0, 1000, ids.shape)
        ids = ops.asarray(ids, dtype="i")
        tokvecs = ops.allocate((5000, model.nI), dtype="f")
        tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
            tokvecs.shape
        )

        def predict(ids, tokvecs):
            # nS ids. nW tokvecs. Exclude the padding array.
            hiddens = model(tokvecs[:-1])  # (nW, f, o, p)
            vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
            # need nS vectors
            hiddens = hiddens.reshape(
                (hiddens.shape[0] * model.nF, model.nO * model.nP)
            )
            model.ops.scatter_add(vectors, ids.flatten(), hiddens)
            vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
            vectors += model.b
            vectors = model.ops.asarray(vectors)
            if model.nP >= 2:
                return model.ops.maxout(vectors)[0]
            else:
                return vectors * (vectors >= 0)

        tol_var = 0.01
        tol_mean = 0.01
        t_max = 10
        t_i = 0
        for t_i in range(t_max):
            acts1 = predict(ids, tokvecs)
            var = model.ops.xp.var(acts1)
            mean = model.ops.xp.mean(acts1)
            if abs(var - 1.0) >= tol_var:
                model.W /= model.ops.xp.sqrt(var)
            elif abs(mean) >= tol_mean:
                model.b -= mean
            else:
                break


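# Illustrative sketch (not part of the original code): a numpy-only rendition
# of the idea behind init_weights above. Feed whitened random data through a
# plain affine layer and iteratively rescale W / shift b until the outputs are
# roughly whitened too. The names W and b and the affine form are assumptions;
# the real code operates on the thinc model's maxout weights instead.
def _whiten_init_sketch(W, b, n_samples=5000, tol=0.01, t_max=10):
    import numpy

    X = numpy.random.normal(0.0, 1.0, (n_samples, W.shape[1])).astype("f")
    for _ in range(t_max):
        acts = X @ W.T + b
        var, mean = acts.var(), acts.mean()
        if abs(var - 1.0) >= tol:
            W /= numpy.sqrt(var)  # squash the output variance towards 1
        elif abs(mean) >= tol:
            b -= mean  # then centre the outputs
        else:
            break
    return W, b

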
def link_vectors_to_models(vocab):
    vectors = vocab.vectors
    if vectors.name is None:
        vectors.name = VECTORS_KEY
        if vectors.data.size != 0:
            user_warning(Warnings.W020.format(shape=vectors.data.shape))
    ops = Model.ops
    for word in vocab:
        if word.orth in vectors.key2row:
            word.rank = vectors.key2row[word.orth]
        else:
            word.rank = 0
    data = ops.asarray(vectors.data)
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (unideal, I know)
    key = (ops.device, vectors.name)
    if key in thinc.extra.load_nlp.VECTORS:
        if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
            # This is a hack to avoid the problem in #3853. Maybe we should
            # print a warning as well?
            old_name = vectors.name
            new_name = vectors.name + "_%d" % data.shape[0]
            user_warning(Warnings.W019.format(old=old_name, new=new_name))
            vectors.name = new_name
            key = (ops.device, vectors.name)
    thinc.extra.load_nlp.VECTORS[key] = data


def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    import torch.nn
    from thinc.api import with_square_sequences
    from thinc.extra.wrappers import PyTorchWrapperRNN

    if depth == 0:
        return layerize(noop())
    model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
    return with_square_sequences(PyTorchWrapperRNN(model))


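# Illustrative usage note (not part of the original code): assuming torch and
# thinc's PyTorch wrappers are installed, a 2-layer BiLSTM encoder mapping
# 96-d token vectors to 128-d outputs could be built with
#     bilstm = PyTorchBiLSTM(nO=128, nI=96, depth=2, dropout=0.2)
# With depth=0 the function returns a no-op layer instead.

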
def Tok2Vec(width, embed_size, **kwargs):
    if not USE_MODEL_REGISTRY_TOK2VEC:
        # Preserve prior tok2vec for backwards compat, in v2.2.2
        return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    char_embed = kwargs.get("char_embed", False)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    conv_window = kwargs.get("conv_window", 1)

    cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]

    doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
    if char_embed:
        embed_cfg = {
            "arch": "spacy.CharacterEmbed.v1",
            "config": {
                "width": 64,
                "chars": 6,
                "@mix": {
                    "arch": "spacy.LayerNormalizedMaxout.v1",
                    "config": {"width": width, "pieces": 3},
                },
                "@embed_features": None,
            },
        }
    else:
        embed_cfg = {
            "arch": "spacy.MultiHashEmbed.v1",
            "config": {
                "width": width,
                "rows": embed_size,
                "columns": cols,
                "use_subwords": subword_features,
                "@pretrained_vectors": None,
                "@mix": {
                    "arch": "spacy.LayerNormalizedMaxout.v1",
                    "config": {"width": width, "pieces": 3},
                },
            },
        }
        if pretrained_vectors:
            embed_cfg["config"]["@pretrained_vectors"] = {
                "arch": "spacy.PretrainedVectors.v1",
                "config": {
                    "vectors_name": pretrained_vectors,
                    "width": width,
                    "column": cols.index("ID"),
                },
            }
    if cnn_maxout_pieces >= 2:
        cnn_cfg = {
            "arch": "spacy.MaxoutWindowEncoder.v1",
            "config": {
                "width": width,
                "window_size": conv_window,
                "pieces": cnn_maxout_pieces,
                "depth": conv_depth,
            },
        }
    else:
        cnn_cfg = {
            "arch": "spacy.MishWindowEncoder.v1",
            "config": {"width": width, "window_size": conv_window, "depth": conv_depth},
        }
    bilstm_cfg = {
        "arch": "spacy.TorchBiLSTMEncoder.v1",
        "config": {"width": width, "depth": bilstm_depth},
    }
    if conv_depth == 0 and bilstm_depth == 0:
        encode_cfg = {}
    elif conv_depth >= 1 and bilstm_depth >= 1:
        encode_cfg = {
            "arch": "thinc.FeedForward.v1",
            "config": {"children": [cnn_cfg, bilstm_cfg]},
        }
    elif conv_depth >= 1:
        encode_cfg = cnn_cfg
    else:
        encode_cfg = bilstm_cfg
    config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
    return new_ml.Tok2Vec(config)


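# Illustrative note (not part of the original code): with the defaults above
# (char_embed=False, pretrained_vectors=None, cnn_maxout_pieces=3, conv_depth=4,
# bilstm_depth=0), Tok2Vec(96, 2000) hands new_ml.Tok2Vec a config of the form
#     {
#         "@doc2feats": {"arch": "spacy.Doc2Feats.v1", ...},
#         "@embed": {"arch": "spacy.MultiHashEmbed.v1", ...},
#         "@encode": {"arch": "spacy.MaxoutWindowEncoder.v1",
#                     "config": {"width": 96, "window_size": 1, "pieces": 3, "depth": 4}},
#     }

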
def reapply(layer, n_times):
    def reapply_fwd(X, drop=0.0):
        backprops = []
        for i in range(n_times):
            Y, backprop = layer.begin_update(X, drop=drop)
            X = Y
            backprops.append(backprop)

        def reapply_bwd(dY, sgd=None):
            dX = None
            for backprop in reversed(backprops):
                dY = backprop(dY, sgd=sgd)
                if dX is None:
                    dX = dY
                else:
                    dX += dY
            return dX

        return Y, reapply_bwd

    return wrap(reapply_fwd, layer)


def asarray(ops, dtype):
    def forward(X, drop=0.0):
        return ops.asarray(X, dtype=dtype), None

    return layerize(forward)


def _divide_array(X, size):
    parts = []
    index = 0
    while index < len(X):
        parts.append(X[index : index + size])
        index += size
    return parts


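# Illustrative example (not part of the original code): _divide_array slices a
# sequence into consecutive chunks of at most `size` items, e.g.
#     _divide_array(list(range(7)), 3) == [[0, 1, 2], [3, 4, 5], [6]]

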
def get_col(idx):
    if idx < 0:
        raise IndexError(Errors.E066.format(value=idx))

    def forward(X, drop=0.0):
        if isinstance(X, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            dX = ops.allocate(X.shape)
            dX[:, idx] += y
            return dX

        return output, backward

    return layerize(forward)


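# Illustrative note (not part of the original code): get_col(2) builds a layer
# whose forward pass returns column 2 of a 2d batch, and whose backward pass
# scatters the incoming gradient back into a zero array of the input's shape.

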
def doc2feats(cols=None):
    if cols is None:
        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]

    def forward(docs, drop=0.0):
        feats = []
        for doc in docs:
            feats.append(doc.to_array(cols))
        return feats, None

    model = layerize(forward)
    model.cols = cols
    return model


def print_shape(prefix):
    def forward(X, drop=0.0):
        return X, lambda dX, **kwargs: dX

    return layerize(forward)


@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
    tokens, attrs, vectors = tokens_attrs_vectors

    def backward(d_output, sgd=None):
        return (tokens, d_output)

    return vectors, backward


@layerize
def logistic(X, drop=0.0):
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range (-10, 10)
    X = xp.minimum(X, 10.0, X)
    X = xp.maximum(X, -10.0, X)
    Y = 1.0 / (1.0 + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        dX = dY * (Y * (1 - Y))
        return dX

    return Y, logistic_bwd


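# Illustrative note (not part of the original code): the clip above bounds the
# activations to [-10, 10] before the sigmoid so xp.exp never overflows;
# sigmoid(10) is already ~0.99995, so the clipping barely changes the outputs.

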
def zero_init(model):
    def _zero_init_impl(self, X, y):
        self.W.fill(0)

    model.on_data_hooks.append(_zero_init_impl)
    return model


def getitem(i):
    def getitem_fwd(X, drop=0.0):
        return X[i], None

    return layerize(getitem_fwd)


@describe.attributes(
    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
    """Neural network layer that predicts several multi-class attributes at once.
    For instance, we might predict one class with 6 variables, and another with 5.
    We predict the 11 neurons required for this, and then softmax them such
    that columns 0-6 make a probability distribution and columns 6-11 make another.
    """

    name = "multisoftmax"

    def __init__(self, out_sizes, nI=None, **kwargs):
        Model.__init__(self, **kwargs)
        self.out_sizes = out_sizes
        self.nO = sum(out_sizes)
        self.nI = nI

    def predict(self, input__BI):
        output__BO = self.ops.affine(self.W, self.b, input__BI)
        i = 0
        for out_size in self.out_sizes:
            self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
            i += out_size
        return output__BO

    def begin_update(self, input__BI, drop=0.0):
        output__BO = self.predict(input__BI)

        def finish_update(grad__BO, sgd=None):
            self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
            self.d_b += grad__BO.sum(axis=0)
            grad__BI = self.ops.gemm(grad__BO, self.W)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return grad__BI

        return output__BO, finish_update


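# Illustrative example (not part of the original code): MultiSoftmax([6, 5], nI=64)
# allocates a single (11, 64) weight matrix; predict() then softmaxes columns
# 0:6 and 6:11 separately, so each group of columns sums to 1 per row.

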
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt("embed_size", 2000)
    if "token_vector_width" in cfg:
        token_vector_width = cfg["token_vector_width"]
    else:
        token_vector_width = util.env_opt("token_vector_width", 96)
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)
    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" in cfg:
            tok2vec = cfg["tok2vec"]
        else:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model


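# Illustrative usage (not part of the original code), with the API as defined
# above: a 17-tag tagger over the default 96-d token vectors would be
#     model = build_tagger_model(17)
# and exposes its tok2vec and softmax sublayers for the pipeline to reuse.

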
def build_morphologizer_model(class_nums, **cfg):
    embed_size = util.env_opt("embed_size", 7000)
    if "token_vector_width" in cfg:
        token_vector_width = cfg["token_vector_width"]
    else:
        token_vector_width = util.env_opt("token_vector_width", 128)
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" in cfg:
            tok2vec = cfg["tok2vec"]
        else:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model


@layerize
def SpacyVectors(docs, drop=0.0):
    batch = []
    for doc in docs:
        indices = numpy.zeros((len(doc),), dtype="i")
        for i, word in enumerate(doc):
            if word.orth in doc.vocab.vectors.key2row:
                indices[i] = doc.vocab.vectors.key2row[word.orth]
            else:
                indices[i] = 0
        vectors = doc.vocab.vectors.data[indices]
        batch.append(vectors)
    return batch, None


def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model


def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
    with Model.define_operators({">>": chain}):
        model = with_cpu(
            Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
        )
        if not no_output_layer:
            model = model >> (cpu_softmax if exclusive_classes else logistic)
    model.nO = nr_class
    return model


@layerize
def cpu_softmax(X, drop=0.0):
    ops = NumpyOps()

    def cpu_softmax_backward(dY, sgd=None):
        return dY

    return ops.softmax(X), cpu_softmax_backward


def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as input.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nr_class, tok2vec.nO)
        else:
            output_layer = (
                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
            )
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model


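# Illustrative usage (not part of the original code), with the API as defined
# above: a 3-class exclusive classifier over mean-pooled token vectors:
#     model = build_simple_cnn_text_classifier(Tok2Vec(96, 2000), 3, exclusive_classes=True)

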
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors", None)
    context_width = cfg.get("entity_width")

    with Model.define_operators({">>": chain, "**": clone}):
        # context encoder
        tok2vec = Tok2Vec(
            width=hidden_width,
            embed_size=embed_width,
            pretrained_vectors=pretrained_vectors,
            cnn_maxout_pieces=cnn_maxout_pieces,
            subword_features=True,
            conv_depth=conv_depth,
            bilstm_depth=0,
        )

        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
            >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
        )

        model.tok2vec = tok2vec
        model.nO = context_width
    return model


@layerize
def flatten(seqs, drop=0.0):
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths, pad=0)

    X = ops.flatten(seqs, pad=0)
    return X, finish_update


def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model


def masked_language_model(vocab, model, mask_prob=0.15):
    """Convert a model into a BERT-style masked language model"""

    random_words = _RandomWords(vocab)

    def mlm_forward(docs, drop=0.0):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, backprop = model.begin_update(docs, drop=drop)

        def mlm_backward(d_output, sgd=None):
            d_output *= 1 - mask
            return backprop(d_output, sgd=sgd)

        return output, mlm_backward

    return wrap(mlm_forward, model)


class _RandomWords(object):
    def __init__(self, vocab):
        self.words = [lex.text for lex in vocab if lex.prob != 0.0]
        self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
        self.words = self.words[:10000]
        self.probs = self.probs[:10000]
        self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
        self.probs /= self.probs.sum()
        self._cache = []

    def next(self):
        if not self._cache:
            self._cache.extend(
                numpy.random.choice(len(self.words), 10000, p=self.probs)
            )
        index = self._cache.pop()
        return self.words[index]


def _apply_mask(docs, random_words, mask_prob=0.15):
    # This needs to be here to avoid circular imports
    from .tokens.doc import Doc

    N = sum(len(doc) for doc in docs)
    mask = numpy.random.uniform(0.0, 1.0, (N,))
    mask = mask >= mask_prob
    i = 0
    masked_docs = []
    for doc in docs:
        words = []
        for token in doc:
            if not mask[i]:
                word = _replace_word(token.text, random_words)
            else:
                word = token.text
            words.append(word)
            i += 1
        spaces = [bool(w.whitespace_) for w in doc]
        # NB: If you change this implementation to instead modify
        # the docs in place, take care that the IDs reflect the original
        # words. Currently we use the original docs to make the vectors
        # for the target, so we don't lose the original tokens. But if
        # you modified the docs in place here, you would.
        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
    return mask, masked_docs


def _replace_word(word, random_words, mask="[MASK]"):
 | 
					 | 
				
			||||||
    roll = numpy.random.random()
 | 
					 | 
				
			||||||
    if roll < 0.8:
 | 
					 | 
				
			||||||
        return mask
 | 
					 | 
				
			||||||
    elif roll < 0.9:
 | 
					 | 
				
			||||||
        return random_words.next()
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        return word
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
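# Illustrative note (not part of the original code): this mirrors the BERT
# masking scheme. A token selected by _apply_mask (about mask_prob=15% of
# tokens) is replaced by "[MASK]" 80% of the time, by a random frequent word
# 10% of the time, and left unchanged the remaining 10%.

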
def _uniform_init(lo, hi):
    def wrapped(W, ops):
        copy_array(W, ops.xp.random.uniform(lo, hi, W.shape))

    return wrapped


@describe.attributes(
    nM=Dimension("Vector dimensions"),
    nC=Dimension("Number of characters per word"),
    vectors=Synapses(
        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
    ),
    d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
    def __init__(self, nM=None, nC=None, **kwargs):
        Model.__init__(self, **kwargs)
        self.nM = nM
        self.nC = nC

    @property
    def nO(self):
        return self.nM * self.nC

    @property
    def nV(self):
        return 256

    def begin_update(self, docs, drop=0.0):
        if not docs:
            return []
        ids = []
        output = []
        weights = self.vectors
        # This assists in indexing; it's like looping over this dimension.
        # I still consider this weird witchcraft... but thanks to Mark Neumann
        # for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
            doc_ids = doc.to_utf8_array(nr_char=self.nC)
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
            # incantation do I chant to get
            # output[i, j, k] == data[j, ids[i, j], k]?
            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
            output.append(doc_vectors.reshape((len(doc), self.nO)))
            ids.append(doc_ids)

        def backprop_character_embed(d_vectors, sgd=None):
            gradient = self.d_vectors
            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
                d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
                gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return None

        return output, backprop_character_embed


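# Illustrative sketch (not part of the original code): the numpy fancy-indexing
# trick used in begin_update above. With a 2d array of ids of shape (nW, nC)
# and a 3d table of shape (nC, nV, nM), indexing the table with
# (arange(nC), ids[:, arange(nC)]) gives out[i, j] == table[j, ids[i, j]].
# All sizes below are arbitrary.
def _char_lookup_sketch():
    import numpy

    nW, nC, nV, nM = 4, 3, 256, 5
    table = numpy.random.uniform(-0.1, 0.1, (nC, nV, nM))
    ids = numpy.random.randint(0, nV, (nW, nC))
    cols = numpy.arange(nC)
    out = table[cols, ids[:, cols]]  # shape (nW, nC, nM)
    assert out.shape == (nW, nC, nM)
    assert numpy.array_equal(out[1, 2], table[2, ids[1, 2]])
    return out

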
def get_cossim_loss(yh, y, ignore_zeros=False):
    xp = get_array_module(yh)
    # Find the zero vectors
    if ignore_zeros:
        zero_indices = xp.abs(y).sum(axis=1) == 0
    # Add a small constant to avoid 0 vectors
    yh = yh + 1e-8
    y = y + 1e-8
    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = xp.abs(cosine - 1)
    if ignore_zeros:
        # If the target was a zero vector, don't count it in the loss.
        d_yh[zero_indices] = 0
        losses[zero_indices] = 0
    loss = losses.sum()
    return loss, -d_yh
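

# Illustrative check (not part of the original code): for a single pair of
# (nearly) orthogonal vectors the cosine is ~0, so the loss is ~1 and the
# returned array is the negated gradient of the cosine w.r.t. yh.
def _cossim_loss_example():
    import numpy

    yh = numpy.asarray([[1.0, 0.0]], dtype="f")
    y = numpy.asarray([[0.0, 1.0]], dtype="f")
    loss, d_yh = get_cossim_loss(yh, y)
    assert abs(loss - 1.0) < 1e-4
    return loss, d_yh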
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.2.3"
+__version__ = "3.0.0.dev3"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
 from wasabi import Printer
 
 from .tokens import Doc, Token, Span
@@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     assert pipeline[index][0] == name
     prev_pipes = pipeline[:index]
     pipe_requires = getattr(pipe, "requires", [])
-    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    requires = {annot: False for annot in pipe_requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
             prev_assigns = getattr(prev_pipe, "assigns", [])
@@ -98,15 +94,15 @@ def validate_attrs(values):
                 for ext_attr, ext_value in value.items():
                     # We don't check whether the attribute actually exists
                     if ext_value is not True:  # attr is something like doc._.x.y
-                        good = "{}._.{}".format(obj_key, ext_attr)
-                        bad = "{}.{}".format(good, ".".join(ext_value))
+                        good = f"{obj_key}._.{ext_attr}"
+                        bad = f"{good}.{'.'.join(ext_value)}"
                         raise ValueError(Errors.E183.format(attr=bad, solution=good))
                 continue  # we can't validate those further
             if attr.endswith("_"):  # attr is something like "token.pos_"
                 raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
             if value is not True:  # attr is something like doc.x.y
-                good = "{}.{}".format(obj_key, attr)
-                bad = "{}.{}".format(good, ".".join(value))
+                good = f"{obj_key}.{attr}"
+                bad = f"{good}.{'.'.join(value)}"
                 raise ValueError(Errors.E183.format(attr=bad, solution=good))
             obj = objs[obj_key]
             if not hasattr(obj, attr):
@@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg.table(overview, header=header, divider=True, multiline=True)
     n_problems = sum(len(p) for p in problems.values())
     if any(p for p in problems.values()):
-        msg.divider("Problems ({})".format(n_problems))
+        msg.divider(f"Problems ({n_problems})")
         for name, problem in problems.items():
             if problem:
-                problem = ", ".join(problem)
-                msg.warn("'{}' requirements not met: {}".format(name, problem))
+                msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
     if no_print:
@@ -91,4 +91,5 @@ cdef enum attr_id_t:
 
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH
     ENT_ID = symbols.ENT_ID
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 
 IDS = {
     "": NULL_ATTR,
@@ -91,6 +88,7 @@ IDS = {
     "SPACY": SPACY,
     "PROB": PROB,
     "LANG": LANG,
+    "MORPH": MORPH,
 }
 
 
@@ -1,12 +1,21 @@
+from wasabi import msg
+
 from .download import download  # noqa: F401
 from .info import info  # noqa: F401
-from .link import link  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train  # noqa: F401
+from .train_from_config import train_from_config_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
+
+
+def link(*args, **kwargs):
+    msg.warn(
+        "As of spaCy v3.0, model symlinks are deprecated. You can load models "
+        "using their full names or from a directory path."
+    )
@@ -1,220 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
-# NB: This schema describes the new format of the training data, see #2928
-TRAINING_SCHEMA = {
-    "$schema": "http://json-schema.org/draft-06/schema",
-    "title": "Training data for spaCy models",
-    "type": "array",
-    "items": {
-        "type": "object",
-        "properties": {
-            "text": {
-                "title": "The text of the training example",
-                "type": "string",
-                "minLength": 1,
-            },
-            "ents": {
-                "title": "Named entity spans in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "start": {
-                            "title": "Start character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "label": {
-                            "title": "Entity label",
-                            "type": "string",
-                            "minLength": 1,
-                            "pattern": "^[A-Z0-9]*$",
-                        },
-                    },
-                    "required": ["start", "end", "label"],
-                },
-            },
-            "sents": {
-                "title": "Sentence spans in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "start": {
-                            "title": "Start character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the span",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                    },
-                    "required": ["start", "end"],
-                },
-            },
-            "cats": {
-                "title": "Text categories for the text classifier",
-                "type": "object",
-                "patternProperties": {
-                    "*": {
-                        "title": "A text category",
-                        "oneOf": [
-                            {"type": "boolean"},
-                            {"type": "number", "minimum": 0},
-                        ],
-                    }
-                },
-                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
-            },
-            "tokens": {
-                "title": "The tokens in the text",
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "minProperties": 1,
-                    "properties": {
-                        "id": {
-                            "title": "Token ID, usually token index",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "start": {
-                            "title": "Start character offset of the token",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "end": {
-                            "title": "End character offset of the token",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                        "pos": {
-                            "title": "Coarse-grained part-of-speech tag",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "tag": {
-                            "title": "Fine-grained part-of-speech tag",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "dep": {
-                            "title": "Dependency label",
-                            "type": "string",
-                            "minLength": 1,
-                        },
-                        "head": {
-                            "title": "Index of the token's head",
-                            "type": "integer",
-                            "minimum": 0,
-                        },
-                    },
-                    "required": ["start", "end"],
-                },
-            },
-            "_": {"title": "Custom user space", "type": "object"},
-        },
-        "required": ["text"],
-    },
-}
-
-META_SCHEMA = {
-    "$schema": "http://json-schema.org/draft-06/schema",
-    "type": "object",
-    "properties": {
-        "lang": {
-            "title": "Two-letter language code, e.g. 'en'",
-            "type": "string",
-            "minLength": 2,
-            "maxLength": 2,
-            "pattern": "^[a-z]*$",
-        },
-        "name": {
-            "title": "Model name",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[a-z_]*$",
-        },
-        "version": {
-            "title": "Model version",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[0-9a-z.-]*$",
-        },
-        "spacy_version": {
-            "title": "Compatible spaCy version identifier",
-            "type": "string",
-            "minLength": 1,
-            "pattern": "^[0-9a-z.-><=]*$",
-        },
-        "parent_package": {
-            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
-            "type": "string",
-            "minLength": 1,
-            "default": "spacy",
-        },
-        "pipeline": {
-            "title": "Names of pipeline components",
-            "type": "array",
-            "items": {"type": "string", "minLength": 1},
-        },
-        "description": {"title": "Model description", "type": "string"},
-        "license": {"title": "Model license", "type": "string"},
-        "author": {"title": "Model author name", "type": "string"},
-        "email": {"title": "Model author email", "type": "string", "format": "email"},
-        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
-        "sources": {
-            "title": "Training data sources",
-            "type": "array",
-            "items": {"type": "string"},
-        },
-        "vectors": {
-            "title": "Included word vectors",
-            "type": "object",
-            "properties": {
-                "keys": {
-                    "title": "Number of unique keys",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-                "vectors": {
-                    "title": "Number of unique vectors",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-                "width": {
-                    "title": "Number of dimensions",
-                    "type": "integer",
-                    "minimum": 0,
-                },
-            },
-        },
-        "accuracy": {
-            "title": "Accuracy numbers",
-            "type": "object",
-            "patternProperties": {"*": {"type": "number", "minimum": 0.0}},
-        },
-        "speed": {
-            "title": "Speed evaluation numbers",
-            "type": "object",
-            "patternProperties": {
-                "*": {
-                    "oneOf": [
-                        {"type": "number", "minimum": 0.0},
-                        {"type": "integer", "minimum": 0},
-                    ]
-                }
-            },
-        },
-    },
-    "required": ["lang", "name", "version"],
-}
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
 from pathlib import Path
 from wasabi import Printer
 import srsly
@@ -29,27 +25,20 @@ FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")


-@plac.annotations(
-    input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
-    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
-    n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
-    seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
-    model=("Model for sentence segmentation (for -s)", "option", "b", str),
-    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
-    lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool),
-)
 def convert(
-    input_file,
-    output_dir="-",
-    file_type="json",
-    n_sents=1,
-    seg_sents=False,
-    model=None,
-    morphology=False,
-    converter="auto",
-    lang=None,
+    # fmt: off
+    input_file: ("Input file", "positional", None, str),
+    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
+    seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
+    model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
+    morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
+    merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
+    converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
+    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    lang: ("Language (if tokenizer required)", "option", "l", str) = None,
+    # fmt: on
 ):
     """
     Convert files into JSON format for use with train command and other
@@ -60,16 +49,10 @@ def convert(
     no_print = output_dir == "-"
     msg = Printer(no_print=no_print)
     input_path = Path(input_file)
-    if file_type not in FILE_TYPES:
-        msg.fail(
-            "Unknown file type: '{}'".format(file_type),
-            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
-            exits=1,
-        )
     if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
         # TODO: support msgpack via stdout in srsly?
         msg.fail(
-            "Can't write .{} data to stdout.".format(file_type),
+            f"Can't write .{file_type} data to stdout",
             "Please specify an output directory.",
             exits=1,
         )
@@ -93,21 +76,26 @@ def convert(
                 "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
             )
     if converter not in CONVERTERS:
-        msg.fail("Can't find converter for {}".format(converter), exits=1)
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    ner_map = None
+    if ner_map_path is not None:
+        ner_map = srsly.read_json(ner_map_path)
     # Use converter function to convert data
     func = CONVERTERS[converter]
     data = func(
         input_data,
         n_sents=n_sents,
         seg_sents=seg_sents,
-        use_morphology=morphology,
+        append_morphology=morphology,
+        merge_subtokens=merge_subtokens,
         lang=lang,
         model=model,
        no_print=no_print,
+        ner_map=ner_map,
     )
     if output_dir != "-":
         # Export data to a file
-        suffix = ".{}".format(file_type)
+        suffix = f".{file_type}"
         output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
         if file_type == "json":
             srsly.write_json(output_file, data)
@@ -115,9 +103,7 @@ def convert(
             srsly.write_jsonl(output_file, data)
         elif file_type == "msg":
             srsly.write_msgpack(output_file, data)
-        msg.good(
-            "Generated output file ({} documents): {}".format(len(data), output_file)
-        )
+        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
     else:
         # Print to stdout
         if file_type == "json":
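A quick illustration of the reworked signature above: the per-argument plac tuples move from the decorator into annotation-style defaults, so the function can still be called directly from Python with plain keyword arguments. This is only a sketch; the file paths are hypothetical and the import path assumes these hunks belong to spacy.cli:

from spacy.cli import convert

# Convert a CoNLL-U file to spaCy's training JSON, merging subtokens
# (the -T flag) and grouping 10 sentences per output doc (-n 10).
convert(
    "train.conllu",      # placeholder input file
    "./corpus",          # placeholder output directory
    file_type="json",
    n_sents=10,
    converter="conllu",
    merge_subtokens=True,
)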
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer

 from ...gold import iob_to_biluo
@@ -64,9 +61,9 @@ def conll_ner2json(
         # sentence segmentation required for document segmentation
         if n_sents > 0 and not seg_sents:
             msg.warn(
-                "No sentence boundaries found to use with option `-n {}`. "
-                "Use `-s` to automatically segment sentences or `-n 0` "
-                "to disable.".format(n_sents)
+                f"No sentence boundaries found to use with option `-n {n_sents}`. "
+                f"Use `-s` to automatically segment sentences or `-n 0` "
+                f"to disable."
             )
         else:
             n_sents_info(msg, n_sents)
@@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
     if model:
         nlp = load_model(model)
         if "parser" in nlp.pipe_names:
-            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
+            msg.info(f"Segmenting sentences with parser from model '{model}'.")
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
         msg.info(
@@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):


 def n_sents_info(msg, n_sents):
-    msg.info("Grouping every {} sentences into a document.".format(n_sents))
+    msg.info(f"Grouping every {n_sents} sentences into a document.")
     if n_sents == 1:
         msg.warn(
             "To generate better training data, you may want to group "
@@ -1,141 +1,348 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import re

-from ...gold import iob_to_biluo
+from ...gold import Example
+from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...language import Language
+from ...tokens import Doc, Token
+from .conll_ner2json import n_sents_info
+from wasabi import Printer


-def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
+def conllu2json(
+    input_data,
+    n_sents=10,
+    append_morphology=False,
+    lang=None,
+    ner_map=None,
+    merge_subtokens=False,
+    no_print=False,
+    **_
+):
     """
     Convert conllu files into JSON format for use with train cli.
-    use_morphology parameter enables appending morphology to tags, which is
+    append_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.

     Extract NER tags if available and convert them so that they follow
     BILUO and the Wikipedia scheme
     """
-    # by @dvsrepo, via #11 explosion/spacy-dev-resources
-    # by @katarkor
+    MISC_NER_PATTERN = "\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"
+    msg = Printer(no_print=no_print)
+    n_sents_info(msg, n_sents)
     docs = []
+    raw = ""
     sentences = []
-    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
-    checked_for_ner = False
-    has_ner_tags = False
-    for i, (raw_text, tokens) in enumerate(conll_tuples):
-        sentence, brackets = tokens[0]
-        if not checked_for_ner:
-            has_ner_tags = is_ner(sentence[5][0])
-            checked_for_ner = True
-        sentences.append(generate_sentence(sentence, has_ner_tags))
+    conll_data = read_conllx(
+        input_data,
+        append_morphology=append_morphology,
+        ner_tag_pattern=MISC_NER_PATTERN,
+        ner_map=ner_map,
+        merge_subtokens=merge_subtokens,
+    )
+    has_ner_tags = has_ner(input_data, ner_tag_pattern=MISC_NER_PATTERN)
+    for i, example in enumerate(conll_data):
+        raw += example.text
+        sentences.append(
+            generate_sentence(
+                example.token_annotation,
+                has_ner_tags,
+                MISC_NER_PATTERN,
+                ner_map=ner_map,
+            )
+        )
         # Real-sized documents could be extracted using the comments on the
-        # conluu document
+        # conllu document
         if len(sentences) % n_sents == 0:
-            doc = create_doc(sentences, i)
+            doc = create_json_doc(raw, sentences, i)
             docs.append(doc)
+            raw = ""
             sentences = []
     if sentences:
-        doc = create_doc(sentences, i)
+        doc = create_json_doc(raw, sentences, i)
         docs.append(doc)
     return docs


-def is_ner(tag):
+def has_ner(input_data, ner_tag_pattern):
     """
     Check the 10th column of the first token to determine if the file contains
     NER tags
     """
-    tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
-    if tag_match:
-        return True
-    elif tag == "O":
-        return True
-    else:
-        return False
-
-
-def read_conllx(input_data, use_morphology=False, n=0):
-    i = 0
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            tokens = []
-            for line in lines:
-                parts = line.split("\t")
-                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
-                if "-" in id_ or "." in id_:
-                    continue
-                try:
-                    id_ = int(id_) - 1
-                    head = (int(head) - 1) if head not in ["0", "_"] else id_
-                    dep = "ROOT" if dep == "root" else dep
-                    tag = pos if tag == "_" else tag
-                    tag = tag + "__" + morph if use_morphology else tag
-                    iob = iob if iob else "O"
-                    tokens.append((id_, word, tag, head, dep, iob))
-                except:  # noqa: E722
-                    print(line)
-                    raise
-            tuples = [list(t) for t in zip(*tokens)]
-            yield (None, [[tuples, []]])
-            i += 1
-            if n >= 1 and i >= n:
-                break
-
-
-def simplify_tags(iob):
-    """
-    Simplify tags obtained from the dataset in order to follow Wikipedia
-    scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
-    'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
-    'MISC'.
-    """
-    new_iob = []
-    for tag in iob:
-        tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
+            if lines:
+                parts = lines[0].split("\t")
+                id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+                if re.search(ner_tag_pattern, misc):
+                    return True
+                else:
+                    return False
+
+
+def read_conllx(
+    input_data,
+    append_morphology=False,
+    merge_subtokens=False,
+    ner_tag_pattern="",
+    ner_map=None,
+):
+    """ Yield examples, one for each sentence """
+    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
+    for sent in input_data.strip().split("\n\n"):
+        lines = sent.strip().split("\n")
+        if lines:
+            while lines[0].startswith("#"):
+                lines.pop(0)
+            example = example_from_conllu_sentence(
+                vocab,
+                lines,
+                ner_tag_pattern,
+                merge_subtokens=merge_subtokens,
+                append_morphology=append_morphology,
+                ner_map=ner_map,
+            )
+            yield example
+
+
+def get_entities(lines, tag_pattern, ner_map=None):
+    """Find entities in the MISC column according to the pattern and map to
+    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
+    the pattern is not matched.
+
+    lines (unicode): CONLL-U lines for one sentences
+    tag_pattern (unicode): Regex pattern for entity tag
+    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
+    RETURNS (list): List of BILUO entity tags
+    """
+    miscs = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_ or "." in id_:
+            continue
+        miscs.append(misc)
+
+    iob = []
+    for misc in miscs:
+        tag_match = re.search(tag_pattern, misc)
+        iob_tag = "O"
         if tag_match:
-            prefix = tag_match.group(1)
-            suffix = tag_match.group(2)
-            if suffix == "GPE_LOC":
-                suffix = "LOC"
-            elif suffix == "GPE_ORG":
-                suffix = "ORG"
-            elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
-                suffix = "MISC"
-            tag = prefix + "-" + suffix
-        new_iob.append(tag)
-    return new_iob
-
-
-def generate_sentence(sent, has_ner_tags):
-    (id_, word, tag, head, dep, iob) = sent
+            prefix = tag_match.group(2)
+            suffix = tag_match.group(3)
+            if prefix and suffix:
+                iob_tag = prefix + "-" + suffix
+                if ner_map:
+                    suffix = ner_map.get(suffix, suffix)
+                    if suffix == "":
+                        iob_tag = "O"
+                    else:
+                        iob_tag = prefix + "-" + suffix
+        iob.append(iob_tag)
+    return iob_to_biluo(iob)
+
+
+def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
     sentence = {}
     tokens = []
-    if has_ner_tags:
-        iob = simplify_tags(iob)
-        biluo = iob_to_biluo(iob)
-    for i, id in enumerate(id_):
+    for i, id_ in enumerate(token_annotation.ids):
         token = {}
-        token["id"] = id
-        token["orth"] = word[i]
-        token["tag"] = tag[i]
-        token["head"] = head[i] - id
-        token["dep"] = dep[i]
+        token["id"] = id_
+        token["orth"] = token_annotation.get_word(i)
+        token["tag"] = token_annotation.get_tag(i)
+        token["pos"] = token_annotation.get_pos(i)
+        token["lemma"] = token_annotation.get_lemma(i)
+        token["morph"] = token_annotation.get_morph(i)
+        token["head"] = token_annotation.get_head(i) - id_
+        token["dep"] = token_annotation.get_dep(i)
         if has_ner_tags:
-            token["ner"] = biluo[i]
+            token["ner"] = token_annotation.get_entity(i)
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence


-def create_doc(sentences, id):
+def create_json_doc(raw, sentences, id_):
     doc = {}
     paragraph = {}
-    doc["id"] = id
+    doc["id"] = id_
     doc["paragraphs"] = []
+    paragraph["raw"] = raw.strip()
     paragraph["sentences"] = sentences
     doc["paragraphs"].append(paragraph)
     return doc
+
+
+def example_from_conllu_sentence(
+    vocab,
+    lines,
+    ner_tag_pattern,
+    merge_subtokens=False,
+    append_morphology=False,
+    ner_map=None,
+):
+    """Create an Example from the lines for one CoNLL-U sentence, merging
+    subtokens and appending morphology to tags if required.
+
+    lines (unicode): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    RETURNS (Example): An example containing the annotation
+    """
+    # create a Doc with each subtoken as its own token
+    # if merging subtokens, each subtoken orth is the merged subtoken form
+    if not Token.has_extension("merged_orth"):
+        Token.set_extension("merged_orth", default="")
+    if not Token.has_extension("merged_lemma"):
+        Token.set_extension("merged_lemma", default="")
+    if not Token.has_extension("merged_morph"):
+        Token.set_extension("merged_morph", default="")
+    if not Token.has_extension("merged_spaceafter"):
+        Token.set_extension("merged_spaceafter", default="")
+    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
+    heads, deps = [], []
+    subtok_word = ""
+    in_subtok = False
+    for i in range(len(lines)):
+        line = lines[i]
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "." in id_:
+            continue
+        if "-" in id_:
+            in_subtok = True
+        if "-" in id_:
+            in_subtok = True
+            subtok_word = word
+            subtok_start, subtok_end = id_.split("-")
+            subtok_spaceafter = "SpaceAfter=No" not in misc
+            continue
+        if merge_subtokens and in_subtok:
+            words.append(subtok_word)
+        else:
+            words.append(word)
+        if in_subtok:
+            if id_ == subtok_end:
+                spaces.append(subtok_spaceafter)
+            else:
+                spaces.append(False)
+        elif "SpaceAfter=No" in misc:
+            spaces.append(False)
+        else:
+            spaces.append(True)
+        if in_subtok and id_ == subtok_end:
+            subtok_word = ""
+            in_subtok = False
+        id_ = int(id_) - 1
+        head = (int(head) - 1) if head not in ("0", "_") else id_
+        tag = pos if tag == "_" else tag
+        morph = morph if morph != "_" else ""
+        dep = "ROOT" if dep == "root" else dep
+        lemmas.append(lemma)
+        poses.append(pos)
+        tags.append(tag)
+        morphs.append(morph)
+        heads.append(head)
+        deps.append(dep)
+
+    doc = Doc(vocab, words=words, spaces=spaces)
+    for i in range(len(doc)):
+        doc[i].tag_ = tags[i]
+        doc[i].pos_ = poses[i]
+        doc[i].dep_ = deps[i]
+        doc[i].lemma_ = lemmas[i]
+        doc[i].head = doc[heads[i]]
+        doc[i]._.merged_orth = words[i]
+        doc[i]._.merged_morph = morphs[i]
+        doc[i]._.merged_lemma = lemmas[i]
+        doc[i]._.merged_spaceafter = spaces[i]
+    ents = get_entities(lines, ner_tag_pattern, ner_map)
+    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.is_parsed = True
+    doc.is_tagged = True
+
+    if merge_subtokens:
+        doc = merge_conllu_subtokens(lines, doc)
+
+    # create Example from custom Doc annotation
+    ids, words, tags, heads, deps = [], [], [], [], []
+    pos, lemmas, morphs, spaces = [], [], [], []
+    for i, t in enumerate(doc):
+        ids.append(i)
+        words.append(t._.merged_orth)
+        if append_morphology and t._.merged_morph:
+            tags.append(t.tag_ + "__" + t._.merged_morph)
+        else:
+            tags.append(t.tag_)
+        pos.append(t.pos_)
+        morphs.append(t._.merged_morph)
+        lemmas.append(t._.merged_lemma)
+        heads.append(t.head.i)
+        deps.append(t.dep_)
+        spaces.append(t._.merged_spaceafter)
+    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+    ents = biluo_tags_from_offsets(doc, ent_offsets)
+    raw = ""
+    for word, space in zip(words, spaces):
+        raw += word
+        if space:
+            raw += " "
+    example = Example(doc=raw)
+    example.set_token_annotation(
+        ids=ids,
+        words=words,
+        tags=tags,
+        pos=pos,
+        morphs=morphs,
+        lemmas=lemmas,
+        heads=heads,
+        deps=deps,
+        entities=ents,
+    )
+    return example
+
+
+def merge_conllu_subtokens(lines, doc):
+    # identify and process all subtoken spans to prepare attrs for merging
+    subtok_spans = []
+    for line in lines:
+        parts = line.split("\t")
+        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
+        if "-" in id_:
+            subtok_start, subtok_end = id_.split("-")
+            subtok_span = doc[int(subtok_start) - 1 : int(subtok_end)]
+            subtok_spans.append(subtok_span)
+            # create merged tag, morph, and lemma values
+            tags = []
+            morphs = {}
+            lemmas = []
+            for token in subtok_span:
+                tags.append(token.tag_)
+                lemmas.append(token.lemma_)
+                if token._.merged_morph:
+                    for feature in token._.merged_morph.split("|"):
+                        field, values = feature.split("=", 1)
+                        if field not in morphs:
+                            morphs[field] = set()
+                        for value in values.split(","):
+                            morphs[field].add(value)
+            # create merged features for each morph field
+            for field, values in morphs.items():
+                morphs[field] = field + "=" + ",".join(sorted(values))
+            # set the same attrs on all subtok tokens so that whatever head the
+            # retokenizer chooses, the final attrs are available on that token
+            for token in subtok_span:
+                token._.merged_orth = token.orth_
+                token._.merged_lemma = " ".join(lemmas)
+                token.tag_ = "_".join(tags)
+                token._.merged_morph = "|".join(sorted(morphs.values()))
+                token._.merged_spaceafter = (
+                    True if subtok_span[-1].whitespace_ else False
+                )
+
+    with doc.retokenize() as retokenizer:
+        for span in subtok_spans:
+            retokenizer.merge(span)
+
+    return doc
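As a small aside on the MISC_NER_PATTERN introduced above: get_entities reads the IOB prefix from group 2 and the entity type from group 3, and falls back to "O" when the MISC column does not match. A minimal sketch (the MISC string below is made up for illustration):

import re

# Same pattern as in the hunk above, written as a raw string here.
MISC_NER_PATTERN = r"\|?(?:name=)?(([A-Z_]+)-([A-Z_]+)|O)\|?"

match = re.search(MISC_NER_PATTERN, "SpaceAfter=No|name=B-PER")
print(match.group(2), match.group(3))  # expected output: B PER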
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer

 from ...gold import iob_to_biluo
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import srsly

 from ...gold import docs_to_json
@@ -1,9 +1,5 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 from collections import Counter
-import plac
 import sys
 import srsly
 from wasabi import Printer, MESSAGES
@@ -22,30 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
 BLANK_MODEL_THRESHOLD = 2000


-@plac.annotations(
-    lang=("model language", "positional", None, str),
-    train_path=("location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("location of JSON-formatted development data", "positional", None, Path),
-    base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=(
-        "Comma-separated names of pipeline components to train",
-        "option",
-        "p",
-        str,
-    ),
-    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
-    verbose=("Print additional information and explanations", "flag", "V", bool),
-    no_format=("Don't pretty-print the results", "flag", "NF", bool),
-)
 def debug_data(
-    lang,
-    train_path,
-    dev_path,
-    base_model=None,
-    pipeline="tagger,parser,ner",
-    ignore_warnings=False,
-    verbose=False,
-    no_format=False,
+    # fmt: off
+    lang: ("Model language", "positional", None, str),
+    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+    base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+    pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
+    ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
+    verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
+    no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
+    # fmt: on
 ):
     """
     Analyze, debug and validate your training and development data, get useful
@@ -60,6 +44,10 @@ def debug_data(
     if not dev_path.exists():
         msg.fail("Development data not found", dev_path, exits=1)

+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
+
     # Initialize the model and pipeline
     pipeline = [p.strip() for p in pipeline.split(",")]
     if base_model:
@@ -67,6 +55,8 @@ def debug_data(
     else:
         lang_cls = get_lang_class(lang)
         nlp = lang_cls()
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)

     msg.divider("Data format validation")

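The new tag_map_path option above is simply read with srsly.read_json and merged into the vocab's tag map before validation. A rough sketch of the mechanics; the file name and the mapping entry are placeholders, and the exact attribute keys should follow spaCy's tag-map format:

import srsly

# Hypothetical tag map file: maps a fine-grained tag to its attributes.
srsly.write_json("tag_map.json", {"NNS": {"pos": "NOUN"}})

tag_map = srsly.read_json("tag_map.json")
# debug_data then applies it to the loaded pipeline:
# nlp.vocab.morphology.tag_map.update(tag_map)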
					@ -80,20 +70,16 @@ def debug_data(
 | 
				
			||||||
    with msg.loading("Loading corpus..."):
 | 
					    with msg.loading("Loading corpus..."):
 | 
				
			||||||
        corpus = GoldCorpus(train_path, dev_path)
 | 
					        corpus = GoldCorpus(train_path, dev_path)
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            train_docs = list(corpus.train_docs(nlp))
 | 
					            train_dataset = list(corpus.train_dataset(nlp))
 | 
				
			||||||
            train_docs_unpreprocessed = list(
 | 
					            train_dataset_unpreprocessed = list(
 | 
				
			||||||
                corpus.train_docs_without_preprocessing(nlp)
 | 
					                corpus.train_dataset_without_preprocessing(nlp)
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        except ValueError as e:
 | 
					        except ValueError as e:
 | 
				
			||||||
            loading_train_error_message = "Training data cannot be loaded: {}".format(
 | 
					            loading_train_error_message = f"Training data cannot be loaded: {e}"
 | 
				
			||||||
                str(e)
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            dev_docs = list(corpus.dev_docs(nlp))
 | 
					            dev_dataset = list(corpus.dev_dataset(nlp))
 | 
				
			||||||
        except ValueError as e:
 | 
					        except ValueError as e:
 | 
				
			||||||
            loading_dev_error_message = "Development data cannot be loaded: {}".format(
 | 
					            loading_dev_error_message = f"Development data cannot be loaded: {e}"
 | 
				
			||||||
                str(e)
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
    if loading_train_error_message or loading_dev_error_message:
 | 
					    if loading_train_error_message or loading_dev_error_message:
 | 
				
			||||||
        if loading_train_error_message:
 | 
					        if loading_train_error_message:
 | 
				
			||||||
            msg.fail(loading_train_error_message)
 | 
					            msg.fail(loading_train_error_message)
 | 
				
			||||||
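Note: the renamed `train_dataset`/`dev_dataset` methods yield example objects instead of `(doc, gold)` tuples; the updated `_compile_gold` and `_get_examples_without_label` further down in this diff access them as `example.doc` and `example.gold`. A small sketch of consuming that API, with placeholder paths; this follows the develop-branch API shown in the diff, not the released 2.x one:

    # Sketch: iterating the corpus the way the updated debug_data does.
    import spacy
    from spacy.gold import GoldCorpus

    nlp = spacy.blank("en")
    corpus = GoldCorpus("train.json", "dev.json")
    for example in corpus.train_dataset(nlp):
        doc = example.doc      # the Doc to train on
        gold = example.gold    # gold annotations (.words, .ner, .cats, ...)
        print(len(doc), len([w for w in gold.words if w is not None]))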
@@ -102,80 +88,68 @@ def debug_data(
         sys.exit(1)
     msg.good("Corpus is loadable")
 
-    # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    # Create all gold data here to avoid iterating over the train_dataset constantly
+    gold_train_data = _compile_gold(train_dataset, pipeline)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_dataset_unpreprocessed, pipeline
+    )
+    gold_dev_data = _compile_gold(dev_dataset, pipeline)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
 
     msg.divider("Training stats")
-    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    msg.text(f"Training pipeline: {', '.join(pipeline)}")
     for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
     else:
-        msg.text("Starting with blank model '{}'".format(lang))
+        msg.text(f"Starting with blank model '{lang}'")
-    msg.text("{} training docs".format(len(train_docs)))
-    msg.text("{} evaluation docs".format(len(dev_docs)))
+    msg.text(f"{len(train_dataset)} training docs")
+    msg.text(f"{len(dev_dataset)} evaluation docs")
 
-    if not len(dev_docs):
+    if not len(gold_dev_data):
         msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
-        msg.warn("{} training examples also in evaluation data".format(overlap))
+        msg.warn(f"{overlap} training examples also in evaluation data")
     else:
         msg.good("No overlap between training and evaluation data")
-    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
-        text = "Low number of examples to train from a blank model ({})".format(
-            len(train_docs)
+    if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
+        text = (
+            f"Low number of examples to train from a blank model ({len(train_dataset)})"
         )
-        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
         else:
             msg.warn(text)
         msg.text(
-            "It's recommended to use at least {} examples (minimum {})".format(
-                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
-            ),
+            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
+            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
             show=verbose,
         )
 
    msg.divider("Vocab & Vectors")
 | 
					    msg.divider("Vocab & Vectors")
 | 
				
			||||||
    n_words = gold_train_data["n_words"]
 | 
					    n_words = gold_train_data["n_words"]
 | 
				
			||||||
    msg.info(
 | 
					    msg.info(
 | 
				
			||||||
        "{} total {} in the data ({} unique)".format(
 | 
					        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
 | 
				
			||||||
            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    if gold_train_data["n_misaligned_words"] > 0:
 | 
					    if gold_train_data["n_misaligned_words"] > 0:
 | 
				
			||||||
        msg.warn(
 | 
					        n_misaligned = gold_train_data["n_misaligned_words"]
 | 
				
			||||||
            "{} misaligned tokens in the training data".format(
 | 
					        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
 | 
				
			||||||
                gold_train_data["n_misaligned_words"]
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if gold_dev_data["n_misaligned_words"] > 0:
 | 
					    if gold_dev_data["n_misaligned_words"] > 0:
 | 
				
			||||||
        msg.warn(
 | 
					        n_misaligned = gold_dev_data["n_misaligned_words"]
 | 
				
			||||||
            "{} misaligned tokens in the dev data".format(
 | 
					        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
 | 
				
			||||||
                gold_dev_data["n_misaligned_words"]
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    most_common_words = gold_train_data["words"].most_common(10)
 | 
					    most_common_words = gold_train_data["words"].most_common(10)
 | 
				
			||||||
    msg.text(
 | 
					    msg.text(
 | 
				
			||||||
        "10 most common words: {}".format(
 | 
					        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
 | 
				
			||||||
            _format_labels(most_common_words, counts=True)
 | 
					 | 
				
			||||||
        ),
 | 
					 | 
				
			||||||
        show=verbose,
 | 
					        show=verbose,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    if len(nlp.vocab.vectors):
 | 
					    if len(nlp.vocab.vectors):
 | 
				
			||||||
        msg.info(
 | 
					        msg.info(
 | 
				
			||||||
            "{} vectors ({} unique keys, {} dimensions)".format(
 | 
					            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
 | 
				
			||||||
                len(nlp.vocab.vectors),
 | 
					            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
 | 
				
			||||||
                nlp.vocab.vectors.n_keys,
 | 
					 | 
				
			||||||
                nlp.vocab.vectors_length,
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        msg.info("No word vectors present in the model")
 | 
					        msg.info("No word vectors present in the model")
 | 
				
			||||||
| 
						 | 
					@ -183,7 +157,7 @@ def debug_data(
 | 
				
			||||||
    if "ner" in pipeline:
 | 
					    if "ner" in pipeline:
 | 
				
			||||||
        # Get all unique NER labels present in the data
 | 
					        # Get all unique NER labels present in the data
 | 
				
			||||||
        labels = set(
 | 
					        labels = set(
 | 
				
			||||||
            label for label in gold_train_data["ner"] if label not in ("O", "-")
 | 
					            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        label_counts = gold_train_data["ner"]
 | 
					        label_counts = gold_train_data["ner"]
 | 
				
			||||||
        model_labels = _get_labels_from_model(nlp, "ner")
 | 
					        model_labels = _get_labels_from_model(nlp, "ner")
 | 
				
			||||||
| 
						 | 
					@ -196,19 +170,10 @@ def debug_data(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        msg.divider("Named Entity Recognition")
 | 
					        msg.divider("Named Entity Recognition")
 | 
				
			||||||
        msg.info(
 | 
					        msg.info(
 | 
				
			||||||
            "{} new {}, {} existing {}".format(
 | 
					            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
 | 
				
			||||||
                len(new_labels),
 | 
					 | 
				
			||||||
                "label" if len(new_labels) == 1 else "labels",
 | 
					 | 
				
			||||||
                len(existing_labels),
 | 
					 | 
				
			||||||
                "label" if len(existing_labels) == 1 else "labels",
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        missing_values = label_counts["-"]
 | 
					        missing_values = label_counts["-"]
 | 
				
			||||||
        msg.text(
 | 
					        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
 | 
				
			||||||
            "{} missing {} (tokens with '-' label)".format(
 | 
					 | 
				
			||||||
                missing_values, "value" if missing_values == 1 else "values"
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        for label in new_labels:
 | 
					        for label in new_labels:
 | 
				
			||||||
            if len(label) == 0:
 | 
					            if len(label) == 0:
 | 
				
			||||||
                msg.fail("Empty label found in new labels")
 | 
					                msg.fail("Empty label found in new labels")
 | 
				
			||||||
| 
						 | 
					@ -219,39 +184,28 @@ def debug_data(
 | 
				
			||||||
                if label != "-"
 | 
					                if label != "-"
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
 | 
					            labels_with_counts = _format_labels(labels_with_counts, counts=True)
 | 
				
			||||||
            msg.text("New: {}".format(labels_with_counts), show=verbose)
 | 
					            msg.text(f"New: {labels_with_counts}", show=verbose)
 | 
				
			||||||
        if existing_labels:
 | 
					        if existing_labels:
 | 
				
			||||||
            msg.text(
 | 
					            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
 | 
				
			||||||
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if gold_train_data["ws_ents"]:
 | 
					        if gold_train_data["ws_ents"]:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
 | 
				
			||||||
                "{} invalid whitespace entity span(s)".format(gold_train_data["ws_ents"])
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
            has_ws_ents_error = True
 | 
					            has_ws_ents_error = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if gold_train_data["punct_ents"]:
 | 
					        if gold_train_data["punct_ents"]:
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
 | 
				
			||||||
                "{} entity span(s) with punctuation".format(gold_train_data["punct_ents"])
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
            has_punct_ents_warning = True
 | 
					            has_punct_ents_warning = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for label in new_labels:
 | 
					        for label in new_labels:
 | 
				
			||||||
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
 | 
					            if label_counts[label] <= NEW_LABEL_THRESHOLD:
 | 
				
			||||||
                msg.warn(
 | 
					                msg.warn(
 | 
				
			||||||
                    "Low number of examples for new label '{}' ({})".format(
 | 
					                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
 | 
				
			||||||
                        label, label_counts[label]
 | 
					 | 
				
			||||||
                    )
 | 
					 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                has_low_data_warning = True
 | 
					                has_low_data_warning = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                with msg.loading("Analyzing label distribution..."):
 | 
					                with msg.loading("Analyzing label distribution..."):
 | 
				
			||||||
                    neg_docs = _get_examples_without_label(train_docs, label)
 | 
					                    neg_docs = _get_examples_without_label(train_dataset, label)
 | 
				
			||||||
                if neg_docs == 0:
 | 
					                if neg_docs == 0:
 | 
				
			||||||
                    msg.warn(
 | 
					                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
 | 
				
			||||||
                        "No examples for texts WITHOUT new label '{}'".format(label)
 | 
					 | 
				
			||||||
                    )
 | 
					 | 
				
			||||||
                    has_no_neg_warning = True
 | 
					                    has_no_neg_warning = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not has_low_data_warning:
 | 
					        if not has_low_data_warning:
 | 
				
			||||||
| 
						 | 
					@ -265,8 +219,8 @@ def debug_data(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if has_low_data_warning:
 | 
					        if has_low_data_warning:
 | 
				
			||||||
            msg.text(
 | 
					            msg.text(
 | 
				
			||||||
                "To train a new entity type, your data should include at "
 | 
					                f"To train a new entity type, your data should include at "
 | 
				
			||||||
                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
 | 
					                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
 | 
				
			||||||
                show=verbose,
 | 
					                show=verbose,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        if has_no_neg_warning:
 | 
					        if has_no_neg_warning:
 | 
				
			||||||
| 
						 | 
					@ -295,27 +249,21 @@ def debug_data(
 | 
				
			||||||
        new_labels = [l for l in labels if l not in model_labels]
 | 
					        new_labels = [l for l in labels if l not in model_labels]
 | 
				
			||||||
        existing_labels = [l for l in labels if l in model_labels]
 | 
					        existing_labels = [l for l in labels if l in model_labels]
 | 
				
			||||||
        msg.info(
 | 
					        msg.info(
 | 
				
			||||||
            "Text Classification: {} new label(s), {} existing label(s)".format(
 | 
					            f"Text Classification: {len(new_labels)} new label(s), "
 | 
				
			||||||
                len(new_labels), len(existing_labels)
 | 
					            f"{len(existing_labels)} existing label(s)"
 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        if new_labels:
 | 
					        if new_labels:
 | 
				
			||||||
            labels_with_counts = _format_labels(
 | 
					            labels_with_counts = _format_labels(
 | 
				
			||||||
                gold_train_data["cats"].most_common(), counts=True
 | 
					                gold_train_data["cats"].most_common(), counts=True
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            msg.text("New: {}".format(labels_with_counts), show=verbose)
 | 
					            msg.text(f"New: {labels_with_counts}", show=verbose)
 | 
				
			||||||
        if existing_labels:
 | 
					        if existing_labels:
 | 
				
			||||||
            msg.text(
 | 
					            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
 | 
				
			||||||
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
 | 
					        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                "The train and dev labels are not the same. "
 | 
					                f"The train and dev labels are not the same. "
 | 
				
			||||||
                "Train labels: {}. "
 | 
					                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
 | 
				
			||||||
                "Dev labels: {}.".format(
 | 
					                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
 | 
				
			||||||
                    _format_labels(gold_train_data["cats"]),
 | 
					 | 
				
			||||||
                    _format_labels(gold_dev_data["cats"]),
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        if gold_train_data["n_cats_multilabel"] > 0:
 | 
					        if gold_train_data["n_cats_multilabel"] > 0:
 | 
				
			||||||
            msg.info(
 | 
					            msg.info(
 | 
				
			||||||
| 
						 | 
					@ -344,28 +292,17 @@ def debug_data(
 | 
				
			||||||
    if "tagger" in pipeline:
 | 
					    if "tagger" in pipeline:
 | 
				
			||||||
        msg.divider("Part-of-speech Tagging")
 | 
					        msg.divider("Part-of-speech Tagging")
 | 
				
			||||||
        labels = [label for label in gold_train_data["tags"]]
 | 
					        labels = [label for label in gold_train_data["tags"]]
 | 
				
			||||||
        tag_map = nlp.Defaults.tag_map
 | 
					        tag_map = nlp.vocab.morphology.tag_map
 | 
				
			||||||
        msg.info(
 | 
					        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
 | 
				
			||||||
            "{} {} in data ({} {} in tag map)".format(
 | 
					 | 
				
			||||||
                len(labels),
 | 
					 | 
				
			||||||
                "label" if len(labels) == 1 else "labels",
 | 
					 | 
				
			||||||
                len(tag_map),
 | 
					 | 
				
			||||||
                "label" if len(tag_map) == 1 else "labels",
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        labels_with_counts = _format_labels(
 | 
					        labels_with_counts = _format_labels(
 | 
				
			||||||
            gold_train_data["tags"].most_common(), counts=True
 | 
					            gold_train_data["tags"].most_common(), counts=True
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        msg.text(labels_with_counts, show=verbose)
 | 
					        msg.text(labels_with_counts, show=verbose)
 | 
				
			||||||
        non_tagmap = [l for l in labels if l not in tag_map]
 | 
					        non_tagmap = [l for l in labels if l not in tag_map]
 | 
				
			||||||
        if not non_tagmap:
 | 
					        if not non_tagmap:
 | 
				
			||||||
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
 | 
					            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
 | 
				
			||||||
        for label in non_tagmap:
 | 
					        for label in non_tagmap:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 | 
				
			||||||
                "Label '{}' not found in tag map for language '{}'".format(
 | 
					 | 
				
			||||||
                    label, nlp.lang
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if "parser" in pipeline:
 | 
					    if "parser" in pipeline:
 | 
				
			||||||
        has_low_data_warning = False
 | 
					        has_low_data_warning = False
 | 
				
			||||||
| 
						 | 
					@ -373,21 +310,18 @@ def debug_data(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # profile sentence length
 | 
					        # profile sentence length
 | 
				
			||||||
        msg.info(
 | 
					        msg.info(
 | 
				
			||||||
            "Found {} sentence{} with an average length of {:.1f} words.".format(
 | 
					            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
 | 
				
			||||||
                gold_train_data["n_sents"],
 | 
					            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
 | 
				
			||||||
                "s" if len(train_docs) > 1 else "",
 | 
					 | 
				
			||||||
                gold_train_data["n_words"] / gold_train_data["n_sents"],
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # check for documents with multiple sentences
 | 
					        # check for documents with multiple sentences
 | 
				
			||||||
        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
 | 
					        sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
 | 
				
			||||||
        if sents_per_doc < 1.1:
 | 
					        if sents_per_doc < 1.1:
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "The training data contains {:.2f} sentences per "
 | 
					                f"The training data contains {sents_per_doc:.2f} sentences per "
 | 
				
			||||||
                "document. When there are very few documents containing more "
 | 
					                f"document. When there are very few documents containing more "
 | 
				
			||||||
                "than one sentence, the parser will not learn how to segment "
 | 
					                f"than one sentence, the parser will not learn how to segment "
 | 
				
			||||||
                "longer texts into sentences.".format(sents_per_doc)
 | 
					                f"longer texts into sentences."
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # profile labels
 | 
					        # profile labels
 | 
				
			||||||
| 
						 | 
					@ -398,32 +332,13 @@ def debug_data(
 | 
				
			||||||
        labels_dev = [label for label in gold_dev_data["deps"]]
 | 
					        labels_dev = [label for label in gold_dev_data["deps"]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
 | 
					        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
 | 
				
			||||||
            msg.info(
 | 
					            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
 | 
				
			||||||
                "Found {} nonprojective train sentence{}".format(
 | 
					            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
 | 
				
			||||||
                    gold_train_unpreprocessed_data["n_nonproj"],
 | 
					 | 
				
			||||||
                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        if gold_dev_data["n_nonproj"] > 0:
 | 
					        if gold_dev_data["n_nonproj"] > 0:
 | 
				
			||||||
            msg.info(
 | 
					            n_nonproj = gold_dev_data["n_nonproj"]
 | 
				
			||||||
                "Found {} nonprojective dev sentence{}".format(
 | 
					            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
 | 
				
			||||||
                    gold_dev_data["n_nonproj"],
 | 
					        msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
 | 
				
			||||||
                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
 | 
					        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        msg.info(
 | 
					 | 
				
			||||||
            "{} {} in train data".format(
 | 
					 | 
				
			||||||
                len(labels_train_unpreprocessed),
 | 
					 | 
				
			||||||
                "label" if len(labels_train) == 1 else "labels",
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        msg.info(
 | 
					 | 
				
			||||||
            "{} {} in projectivized train data".format(
 | 
					 | 
				
			||||||
                len(labels_train), "label" if len(labels_train) == 1 else "labels"
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        labels_with_counts = _format_labels(
 | 
					        labels_with_counts = _format_labels(
 | 
				
			||||||
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
 | 
					            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
| 
						 | 
					@ -433,9 +348,8 @@ def debug_data(
 | 
				
			||||||
        for label in gold_train_unpreprocessed_data["deps"]:
 | 
					        for label in gold_train_unpreprocessed_data["deps"]:
 | 
				
			||||||
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
 | 
					            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
 | 
				
			||||||
                msg.warn(
 | 
					                msg.warn(
 | 
				
			||||||
                    "Low number of examples for label '{}' ({})".format(
 | 
					                    f"Low number of examples for label '{label}' "
 | 
				
			||||||
                        label, gold_train_unpreprocessed_data["deps"][label]
 | 
					                    f"({gold_train_unpreprocessed_data['deps'][label]})"
 | 
				
			||||||
                    )
 | 
					 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                has_low_data_warning = True
 | 
					                has_low_data_warning = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -444,22 +358,19 @@ def debug_data(
 | 
				
			||||||
        for label in gold_train_data["deps"]:
 | 
					        for label in gold_train_data["deps"]:
 | 
				
			||||||
            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
 | 
					            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
 | 
				
			||||||
                rare_projectivized_labels.append(
 | 
					                rare_projectivized_labels.append(
 | 
				
			||||||
                    "{}: {}".format(label, str(gold_train_data["deps"][label]))
 | 
					                    f"{label}: {gold_train_data['deps'][label]}"
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if len(rare_projectivized_labels) > 0:
 | 
					        if len(rare_projectivized_labels) > 0:
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "Low number of examples for {} label{} in the "
 | 
					                f"Low number of examples for {len(rare_projectivized_labels)} "
 | 
				
			||||||
                "projectivized dependency trees used for training. You may "
 | 
					                "label(s) in the projectivized dependency trees used for "
 | 
				
			||||||
                "want to projectivize labels such as punct before "
 | 
					                "training. You may want to projectivize labels such as punct "
 | 
				
			||||||
                "training in order to improve parser performance.".format(
 | 
					                "before training in order to improve parser performance."
 | 
				
			||||||
                    len(rare_projectivized_labels),
 | 
					 | 
				
			||||||
                    "s" if len(rare_projectivized_labels) > 1 else "",
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "Projectivized labels with low numbers of examples: "
 | 
					                f"Projectivized labels with low numbers of examples: ",
 | 
				
			||||||
                "{}".format("\n".join(rare_projectivized_labels)),
 | 
					                ", ".join(rare_projectivized_labels),
 | 
				
			||||||
                show=verbose,
 | 
					                show=verbose,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            has_low_data_warning = True
 | 
					            has_low_data_warning = True
 | 
				
			||||||
| 
						 | 
					@ -467,50 +378,44 @@ def debug_data(
 | 
				
			||||||
        # labels only in train
 | 
					        # labels only in train
 | 
				
			||||||
        if set(labels_train) - set(labels_dev):
 | 
					        if set(labels_train) - set(labels_dev):
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "The following labels were found only in the train data: "
 | 
					                "The following labels were found only in the train data:",
 | 
				
			||||||
                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
 | 
					                ", ".join(set(labels_train) - set(labels_dev)),
 | 
				
			||||||
                show=verbose,
 | 
					                show=verbose,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # labels only in dev
 | 
					        # labels only in dev
 | 
				
			||||||
        if set(labels_dev) - set(labels_train):
 | 
					        if set(labels_dev) - set(labels_train):
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "The following labels were found only in the dev data: "
 | 
					                "The following labels were found only in the dev data:",
 | 
				
			||||||
                + ", ".join(set(labels_dev) - set(labels_train)),
 | 
					                ", ".join(set(labels_dev) - set(labels_train)),
 | 
				
			||||||
                show=verbose,
 | 
					                show=verbose,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if has_low_data_warning:
 | 
					        if has_low_data_warning:
 | 
				
			||||||
            msg.text(
 | 
					            msg.text(
 | 
				
			||||||
                "To train a parser, your data should include at "
 | 
					                f"To train a parser, your data should include at "
 | 
				
			||||||
                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
 | 
					                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
 | 
				
			||||||
                show=verbose,
 | 
					                show=verbose,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # multiple root labels
 | 
					        # multiple root labels
 | 
				
			||||||
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
 | 
					        if len(gold_train_unpreprocessed_data["roots"]) > 1:
 | 
				
			||||||
            msg.warn(
 | 
					            msg.warn(
 | 
				
			||||||
                "Multiple root labels ({}) ".format(
 | 
					                f"Multiple root labels "
 | 
				
			||||||
                    ", ".join(gold_train_unpreprocessed_data["roots"])
 | 
					                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
 | 
				
			||||||
                )
 | 
					                f"found in training data. spaCy's parser uses a single root "
 | 
				
			||||||
                + "found in training data. spaCy's parser uses a single root "
 | 
					                f"label ROOT so this distinction will not be available."
 | 
				
			||||||
                "label ROOT so this distinction will not be available."
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # these should not happen, but just in case
 | 
					        # these should not happen, but just in case
 | 
				
			||||||
        if gold_train_data["n_nonproj"] > 0:
 | 
					        if gold_train_data["n_nonproj"] > 0:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                "Found {} nonprojective projectivized train sentence{}".format(
 | 
					                f"Found {gold_train_data['n_nonproj']} nonprojective "
 | 
				
			||||||
                    gold_train_data["n_nonproj"],
 | 
					                f"projectivized train sentence(s)"
 | 
				
			||||||
                    "s" if gold_train_data["n_nonproj"] > 1 else "",
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        if gold_train_data["n_cycles"] > 0:
 | 
					        if gold_train_data["n_cycles"] > 0:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                "Found {} projectivized train sentence{} with cycles".format(
 | 
					                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
 | 
				
			||||||
                    gold_train_data["n_cycles"],
 | 
					 | 
				
			||||||
                    "s" if gold_train_data["n_cycles"] > 1 else "",
 | 
					 | 
				
			||||||
                )
 | 
					 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    msg.divider("Summary")
 | 
					    msg.divider("Summary")
 | 
				
			||||||
| 
						 | 
					@ -518,42 +423,34 @@ def debug_data(
 | 
				
			||||||
    warn_counts = msg.counts[MESSAGES.WARN]
 | 
					    warn_counts = msg.counts[MESSAGES.WARN]
 | 
				
			||||||
    fail_counts = msg.counts[MESSAGES.FAIL]
 | 
					    fail_counts = msg.counts[MESSAGES.FAIL]
 | 
				
			||||||
    if good_counts:
 | 
					    if good_counts:
 | 
				
			||||||
        msg.good(
 | 
					        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
 | 
				
			||||||
            "{} {} passed".format(
 | 
					 | 
				
			||||||
                good_counts, "check" if good_counts == 1 else "checks"
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if warn_counts:
 | 
					    if warn_counts:
 | 
				
			||||||
        msg.warn(
 | 
					        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
 | 
				
			||||||
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if fail_counts:
 | 
					 | 
				
			||||||
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if fail_counts:
 | 
					    if fail_counts:
 | 
				
			||||||
 | 
					        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
 | 
				
			||||||
        sys.exit(1)
 | 
					        sys.exit(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _load_file(file_path, msg):
 | 
					def _load_file(file_path, msg):
 | 
				
			||||||
    file_name = file_path.parts[-1]
 | 
					    file_name = file_path.parts[-1]
 | 
				
			||||||
    if file_path.suffix == ".json":
 | 
					    if file_path.suffix == ".json":
 | 
				
			||||||
        with msg.loading("Loading {}...".format(file_name)):
 | 
					        with msg.loading(f"Loading {file_name}..."):
 | 
				
			||||||
            data = srsly.read_json(file_path)
 | 
					            data = srsly.read_json(file_path)
 | 
				
			||||||
        msg.good("Loaded {}".format(file_name))
 | 
					        msg.good(f"Loaded {file_name}")
 | 
				
			||||||
        return data
 | 
					        return data
 | 
				
			||||||
    elif file_path.suffix == ".jsonl":
 | 
					    elif file_path.suffix == ".jsonl":
 | 
				
			||||||
        with msg.loading("Loading {}...".format(file_name)):
 | 
					        with msg.loading(f"Loading {file_name}..."):
 | 
				
			||||||
            data = srsly.read_jsonl(file_path)
 | 
					            data = srsly.read_jsonl(file_path)
 | 
				
			||||||
        msg.good("Loaded {}".format(file_name))
 | 
					        msg.good(f"Loaded {file_name}")
 | 
				
			||||||
        return data
 | 
					        return data
 | 
				
			||||||
    msg.fail(
 | 
					    msg.fail(
 | 
				
			||||||
        "Can't load file extension {}".format(file_path.suffix),
 | 
					        f"Can't load file extension {file_path.suffix}",
 | 
				
			||||||
        "Expected .json or .jsonl",
 | 
					        "Expected .json or .jsonl",
 | 
				
			||||||
        exits=1,
 | 
					        exits=1,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _compile_gold(train_docs, pipeline):
 | 
					def _compile_gold(examples, pipeline):
 | 
				
			||||||
    data = {
 | 
					    data = {
 | 
				
			||||||
        "ner": Counter(),
 | 
					        "ner": Counter(),
 | 
				
			||||||
        "cats": Counter(),
 | 
					        "cats": Counter(),
 | 
				
			||||||
| 
						 | 
					@ -571,7 +468,9 @@ def _compile_gold(train_docs, pipeline):
 | 
				
			||||||
        "n_cats_multilabel": 0,
 | 
					        "n_cats_multilabel": 0,
 | 
				
			||||||
        "texts": set(),
 | 
					        "texts": set(),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    for doc, gold in train_docs:
 | 
					    for example in examples:
 | 
				
			||||||
 | 
					        gold = example.gold
 | 
				
			||||||
 | 
					        doc = example.doc
 | 
				
			||||||
        valid_words = [x for x in gold.words if x is not None]
 | 
					        valid_words = [x for x in gold.words if x is not None]
 | 
				
			||||||
        data["words"].update(valid_words)
 | 
					        data["words"].update(valid_words)
 | 
				
			||||||
        data["n_words"] += len(valid_words)
 | 
					        data["n_words"] += len(valid_words)
 | 
				
			||||||
| 
						 | 
					@ -584,7 +483,13 @@ def _compile_gold(train_docs, pipeline):
 | 
				
			||||||
                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
 | 
					                if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
 | 
				
			||||||
                    # "Illegal" whitespace entity
 | 
					                    # "Illegal" whitespace entity
 | 
				
			||||||
                    data["ws_ents"] += 1
 | 
					                    data["ws_ents"] += 1
 | 
				
			||||||
                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [".", "'", "!", "?", ","]:
 | 
					                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
 | 
				
			||||||
 | 
					                    ".",
 | 
				
			||||||
 | 
					                    "'",
 | 
				
			||||||
 | 
					                    "!",
 | 
				
			||||||
 | 
					                    "?",
 | 
				
			||||||
 | 
					                    ",",
 | 
				
			||||||
 | 
					                ]:
 | 
				
			||||||
                    # punctuation entity: could be replaced by whitespace when training with noise,
 | 
					                    # punctuation entity: could be replaced by whitespace when training with noise,
 | 
				
			||||||
                    # so add a warning to alert the user to this unexpected side effect.
 | 
					                    # so add a warning to alert the user to this unexpected side effect.
 | 
				
			||||||
                    data["punct_ents"] += 1
 | 
					                    data["punct_ents"] += 1
 | 
				
			||||||
| 
						 | 
					@ -614,14 +519,18 @@ def _compile_gold(train_docs, pipeline):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 def _format_labels(labels, counts=False):
     if counts:
-        return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
+        return ", ".join([f"'{l}' ({c})" for l, c in labels])
-    return ", ".join(["'{}'".format(l) for l in labels])
+    return ", ".join([f"'{l}'" for l in labels])
 
 
 def _get_examples_without_label(data, label):
     count = 0
-    for doc, gold in data:
+    for ex in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+        labels = [
+            label.split("-")[1]
+            for label in ex.gold.ner
+            if label not in ("O", "-", None)
+        ]
         if label not in labels:
             count += 1
     return count
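Note: for reference, a standalone illustration of what `_format_labels` produces for the label counters compiled above; the helper is copied here so the snippet runs on its own, and the labels and counts are invented:

    from collections import Counter


    def _format_labels(labels, counts=False):
        # copied from the diff above for a self-contained demo
        if counts:
            return ", ".join([f"'{l}' ({c})" for l, c in labels])
        return ", ".join([f"'{l}'" for l in labels])


    label_counts = Counter({"PERSON": 120, "ORG": 45, "GPE": 3})  # invented numbers
    print(_format_labels(label_counts.most_common(), counts=True))
    # 'PERSON' (120), 'ORG' (45), 'GPE' (3)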
 
@@ -1,28 +1,21 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import plac
 import requests
 import os
 import subprocess
 import sys
 from wasabi import msg
 
-from .link import link
-from ..util import get_package_path
 from .. import about
 
 
-@plac.annotations(
-    model=("Model to download (shortcut or name)", "positional", None, str),
-    direct=("Force direct download of name + version", "flag", "d", bool),
-    pip_args=("Additional arguments to be passed to `pip install` on model install"),
-)
-def download(model, direct=False, *pip_args):
+def download(
+    model: ("Model to download (shortcut or name)", "positional", None, str),
+    direct: ("Force direct download of name + version", "flag", "d", bool) = False,
+    *pip_args: ("Additional arguments to be passed to `pip install` on model install"),
+):
     """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version. For direct downloads, the compatibility check will be skipped.
+    Download compatible model from default download path using pip. If --direct
+    flag is set, the command expects the full model name with version.
+    For direct downloads, the compatibility check will be skipped.
     """
     if not require_package("spacy") and "--no-deps" not in pip_args:
         msg.warn(
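Note: with the decorator gone, `*pip_args` still collects everything after the model name and forwards it to `pip install`. A hedged usage sketch; the model name is an example, and "--no-deps" is the one flag the code above checks for explicitly:

    # Sketch: the command can also be called from Python instead of the CLI
    # (roughly `python -m spacy download en_core_web_sm` from the shell).
    from spacy.cli import download

    download("en_core_web_sm")                      # plain download
    download("en_core_web_sm", False, "--no-deps")  # extra args are passed to pip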
| 
						 | 
					@ -50,29 +43,7 @@ def download(model, direct=False, *pip_args):
 | 
				
			||||||
            sys.exit(dl)
 | 
					            sys.exit(dl)
 | 
				
			||||||
        msg.good(
 | 
					        msg.good(
 | 
				
			||||||
            "Download and installation successful",
 | 
					            "Download and installation successful",
 | 
				
			||||||
            "You can now load the model via spacy.load('{}')".format(model_name),
 | 
					            f"You can now load the model via spacy.load('{model_name}')",
 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        # Only create symlink if the model is installed via a shortcut like 'en'.
 | 
					 | 
				
			||||||
        # There's no real advantage over an additional symlink for en_core_web_sm
 | 
					 | 
				
			||||||
        # and if anything, it's more error prone and causes more confusion.
 | 
					 | 
				
			||||||
        if model in shortcuts:
 | 
					 | 
				
			||||||
            try:
 | 
					 | 
				
			||||||
                # Get package path here because link uses
 | 
					 | 
				
			||||||
                # pip.get_installed_distributions() to check if model is a
 | 
					 | 
				
			||||||
                # package, which fails if model was just installed via
 | 
					 | 
				
			||||||
                # subprocess
 | 
					 | 
				
			||||||
                package_path = get_package_path(model_name)
 | 
					 | 
				
			||||||
                link(model_name, model, force=True, model_path=package_path)
 | 
					 | 
				
			||||||
            except:  # noqa: E722
 | 
					 | 
				
			||||||
                # Dirty, but since spacy.download and the auto-linking is
 | 
					 | 
				
			||||||
                # mostly a convenience wrapper, it's best to show a success
 | 
					 | 
				
			||||||
                # message and loading instructions, even if linking fails.
 | 
					 | 
				
			||||||
                msg.warn(
 | 
					 | 
				
			||||||
                    "Download successful but linking failed",
 | 
					 | 
				
			||||||
                    "Creating a shortcut link for '{}' didn't work (maybe you "
 | 
					 | 
				
			||||||
                    "don't have admin permissions?), but you can still load "
 | 
					 | 
				
			||||||
                    "the model via its full package name: "
 | 
					 | 
				
			||||||
                    "nlp = spacy.load('{}')".format(model, model_name),
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        # If a model is downloaded and then loaded within the same process, our
 | 
					        # If a model is downloaded and then loaded within the same process, our
 | 
				
			||||||
        # is_package check currently fails, because pkg_resources.working_set
 | 
					        # is_package check currently fails, because pkg_resources.working_set
 | 
				
			||||||
| 
						 | 
					@ -95,11 +66,11 @@ def get_json(url, desc):
 | 
				
			||||||
    r = requests.get(url)
 | 
					    r = requests.get(url)
 | 
				
			||||||
    if r.status_code != 200:
 | 
					    if r.status_code != 200:
 | 
				
			||||||
        msg.fail(
 | 
					        msg.fail(
 | 
				
			||||||
            "Server error ({})".format(r.status_code),
 | 
					            f"Server error ({r.status_code})",
 | 
				
			||||||
            "Couldn't fetch {}. Please find a model for your spaCy "
 | 
					            f"Couldn't fetch {desc}. Please find a model for your spaCy "
 | 
				
			||||||
            "installation (v{}), and download it manually. For more "
 | 
					            f"installation (v{about.__version__}), and download it manually. "
 | 
				
			||||||
            "details, see the documentation: "
 | 
					            f"For more details, see the documentation: "
 | 
				
			||||||
            "https://spacy.io/usage/models".format(desc, about.__version__),
 | 
					            f"https://spacy.io/usage/models",
 | 
				
			||||||
            exits=1,
 | 
					            exits=1,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    return r.json()
 | 
					    return r.json()
 | 
				
			||||||
| 
						 | 
					@ -111,7 +82,7 @@ def get_compatibility():
 | 
				
			||||||
    comp_table = get_json(about.__compatibility__, "compatibility table")
 | 
					    comp_table = get_json(about.__compatibility__, "compatibility table")
 | 
				
			||||||
    comp = comp_table["spacy"]
 | 
					    comp = comp_table["spacy"]
 | 
				
			||||||
    if version not in comp:
 | 
					    if version not in comp:
 | 
				
			||||||
        msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
 | 
					        msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
 | 
				
			||||||
    return comp[version]
 | 
					    return comp[version]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -119,8 +90,7 @@ def get_version(model, comp):
 | 
				
			||||||
    model = model.rsplit(".dev", 1)[0]
 | 
					    model = model.rsplit(".dev", 1)[0]
 | 
				
			||||||
    if model not in comp:
 | 
					    if model not in comp:
 | 
				
			||||||
        msg.fail(
 | 
					        msg.fail(
 | 
				
			||||||
            "No compatible model found for '{}' "
 | 
					            f"No compatible model found for '{model}' (spaCy v{about.__version__})",
 | 
				
			||||||
            "(spaCy v{}).".format(model, about.__version__),
 | 
					 | 
				
			||||||
            exits=1,
 | 
					            exits=1,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    return comp[model][0]
 | 
					    return comp[model][0]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
-import plac
 from timeit import default_timer as timer
 from wasabi import msg
 
@@ -10,23 +6,16 @@ from .. import util
 from .. import displacy
 
 
-@plac.annotations(
-    model=("Model name or path", "positional", None, str),
-    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    gpu_id=("Use GPU", "option", "g", int),
-    displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
-    displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
-    return_scores=("Return dict containing model scores", "flag", "R", bool),
-)
 def evaluate(
-    model,
-    data_path,
-    gpu_id=-1,
-    gold_preproc=False,
-    displacy_path=None,
-    displacy_limit=25,
-    return_scores=False,
+    # fmt: off
+    model: ("Model name or path", "positional", None, str),
+    data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
+    gpu_id: ("Use GPU", "option", "g", int) = -1,
+    gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+    displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
+    displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
+    return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
+    # fmt: on
 ):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
@@ -44,28 +33,31 @@ def evaluate(
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
-    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+    dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
     begin = timer()
-    scorer = nlp.evaluate(dev_docs, verbose=False)
+    scorer = nlp.evaluate(dev_dataset, verbose=False)
     end = timer()
-    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+    nwords = sum(len(ex.doc) for ex in dev_dataset)
     results = {
-        "Time": "%.2f s" % (end - begin),
+        "Time": f"{end - begin:.2f} s",
         "Words": nwords,
-        "Words/s": "%.0f" % (nwords / (end - begin)),
-        "TOK": "%.2f" % scorer.token_acc,
-        "POS": "%.2f" % scorer.tags_acc,
-        "UAS": "%.2f" % scorer.uas,
-        "LAS": "%.2f" % scorer.las,
-        "NER P": "%.2f" % scorer.ents_p,
-        "NER R": "%.2f" % scorer.ents_r,
-        "NER F": "%.2f" % scorer.ents_f,
-        "Textcat": "%.2f" % scorer.textcat_score,
+        "Words/s": f"{nwords / (end - begin):.0f}",
+        "TOK": f"{scorer.token_acc:.2f}",
+        "POS": f"{scorer.tags_acc:.2f}",
+        "UAS": f"{scorer.uas:.2f}",
+        "LAS": f"{scorer.las:.2f}",
+        "NER P": f"{scorer.ents_p:.2f}",
+        "NER R": f"{scorer.ents_r:.2f}",
+        "NER F": f"{scorer.ents_f:.2f}",
+        "Textcat": f"{scorer.textcat_score:.2f}",
+        "Sent P": f"{scorer.sent_p:.2f}",
+        "Sent R": f"{scorer.sent_r:.2f}",
+        "Sent F": f"{scorer.sent_f:.2f}",
     }
     msg.table(results, title="Results")
 
     if displacy_path:
-        docs, golds = zip(*dev_docs)
+        docs = [ex.doc for ex in dev_dataset]
         render_deps = "parser" in nlp.meta.get("pipeline", [])
         render_ents = "ner" in nlp.meta.get("pipeline", [])
         render_parses(
@@ -76,7 +68,7 @@ def evaluate(
             deps=render_deps,
             ents=render_ents,
         )
-        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
     if return_scores:
         return scorer.scores
 | 
					
 | 
				
			||||||
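The hunk above drops the `@plac.annotations` decorator and moves each option into a plac-style type annotation with its default in the signature, and it replaces the old `dev_docs` tuples with a `dev_dataset` of `Example` objects. A minimal sketch of calling the converted command programmatically; the model name and data path are illustrative and a trained model plus a JSON-formatted evaluation file are required for it to actually run:

    # Hypothetical model name and data path.
    from spacy.cli import evaluate

    scores = evaluate(
        "en_core_web_sm",      # model
        "dev.json",            # data_path
        gold_preproc=False,
        return_scores=True,    # returns scorer.scores as a dict
    )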
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,44 +1,39 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import platform
 | 
					import platform
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import path2str, basestring_, unicode_
 | 
					from .validate import get_model_pkgs
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from .. import about
 | 
					from .. import about
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					def info(
 | 
				
			||||||
    model=("Optional shortcut link of model", "positional", None, str),
 | 
					    model: ("Optional model name", "positional", None, str) = None,
 | 
				
			||||||
    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
 | 
					    markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
 | 
				
			||||||
    silent=("Don't print anything (just return)", "flag", "s"),
 | 
					    silent: ("Don't print anything (just return)", "flag", "s") = False,
 | 
				
			||||||
)
 | 
					):
 | 
				
			||||||
def info(model=None, markdown=False, silent=False):
 | 
					 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Print info about spaCy installation. If a model shortcut link is
 | 
					    Print info about spaCy installation. If a model is specified as an argument,
 | 
				
			||||||
    specified as an argument, print model information. Flag --markdown
 | 
					    print model information. Flag --markdown prints details in Markdown for easy
 | 
				
			||||||
    prints details in Markdown for easy copy-pasting to GitHub issues.
 | 
					    copy-pasting to GitHub issues.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    if model:
 | 
					    if model:
 | 
				
			||||||
        if util.is_package(model):
 | 
					        if util.is_package(model):
 | 
				
			||||||
            model_path = util.get_package_path(model)
 | 
					            model_path = util.get_package_path(model)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            model_path = util.get_data_path() / model
 | 
					            model_path = model
 | 
				
			||||||
        meta_path = model_path / "meta.json"
 | 
					        meta_path = model_path / "meta.json"
 | 
				
			||||||
        if not meta_path.is_file():
 | 
					        if not meta_path.is_file():
 | 
				
			||||||
            msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
					            msg.fail("Can't find model meta.json", meta_path, exits=1)
 | 
				
			||||||
        meta = srsly.read_json(meta_path)
 | 
					        meta = srsly.read_json(meta_path)
 | 
				
			||||||
        if model_path.resolve() != model_path:
 | 
					        if model_path.resolve() != model_path:
 | 
				
			||||||
            meta["link"] = path2str(model_path)
 | 
					            meta["link"] = str(model_path)
 | 
				
			||||||
            meta["source"] = path2str(model_path.resolve())
 | 
					            meta["source"] = str(model_path.resolve())
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            meta["source"] = path2str(model_path)
 | 
					            meta["source"] = str(model_path)
 | 
				
			||||||
        if not silent:
 | 
					        if not silent:
 | 
				
			||||||
            title = "Info about model '{}'".format(model)
 | 
					            title = f"Info about model '{model}'"
 | 
				
			||||||
            model_meta = {
 | 
					            model_meta = {
 | 
				
			||||||
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
 | 
					                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
| 
						 | 
					@ -47,12 +42,13 @@ def info(model=None, markdown=False, silent=False):
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                msg.table(model_meta, title=title)
 | 
					                msg.table(model_meta, title=title)
 | 
				
			||||||
        return meta
 | 
					        return meta
 | 
				
			||||||
 | 
					    all_models, _ = get_model_pkgs()
 | 
				
			||||||
    data = {
 | 
					    data = {
 | 
				
			||||||
        "spaCy version": about.__version__,
 | 
					        "spaCy version": about.__version__,
 | 
				
			||||||
        "Location": path2str(Path(__file__).parent.parent),
 | 
					        "Location": str(Path(__file__).parent.parent),
 | 
				
			||||||
        "Platform": platform.platform(),
 | 
					        "Platform": platform.platform(),
 | 
				
			||||||
        "Python version": platform.python_version(),
 | 
					        "Python version": platform.python_version(),
 | 
				
			||||||
        "Models": list_models(),
 | 
					        "Models": ", ".join(model["name"] for model in all_models.values()),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    if not silent:
 | 
					    if not silent:
 | 
				
			||||||
        title = "Info about spaCy"
 | 
					        title = "Info about spaCy"
 | 
				
			||||||
| 
						 | 
					@ -63,19 +59,6 @@ def info(model=None, markdown=False, silent=False):
 | 
				
			||||||
    return data
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def list_models():
 | 
					 | 
				
			||||||
    def exclude_dir(dir_name):
 | 
					 | 
				
			||||||
        # exclude common cache directories and hidden directories
 | 
					 | 
				
			||||||
        exclude = ("cache", "pycache", "__pycache__")
 | 
					 | 
				
			||||||
        return dir_name in exclude or dir_name.startswith(".")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    data_path = util.get_data_path()
 | 
					 | 
				
			||||||
    if data_path:
 | 
					 | 
				
			||||||
        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
 | 
					 | 
				
			||||||
        return ", ".join([m for m in models if not exclude_dir(m)])
 | 
					 | 
				
			||||||
    return "-"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def print_markdown(data, title=None):
 | 
					def print_markdown(data, title=None):
 | 
				
			||||||
    """Print data in GitHub-flavoured Markdown format for issues etc.
 | 
					    """Print data in GitHub-flavoured Markdown format for issues etc.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -84,9 +67,9 @@ def print_markdown(data, title=None):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    markdown = []
 | 
					    markdown = []
 | 
				
			||||||
    for key, value in data.items():
 | 
					    for key, value in data.items():
 | 
				
			||||||
        if isinstance(value, basestring_) and Path(value).exists():
 | 
					        if isinstance(value, str) and Path(value).exists():
 | 
				
			||||||
            continue
 | 
					            continue
 | 
				
			||||||
        markdown.append("* **{}:** {}".format(key, unicode_(value)))
 | 
					        markdown.append(f"* **{key}:** {value}")
 | 
				
			||||||
    if title:
 | 
					    if title:
 | 
				
			||||||
        print("\n## {}".format(title))
 | 
					        print(f"\n## {title}")
 | 
				
			||||||
    print("\n{}\n".format("\n".join(markdown)))
 | 
					    print("\n{}\n".format("\n".join(markdown)))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
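The `info` command above now lists installed model packages via `get_model_pkgs()` instead of scanning the old shortcut-link data directory (the deleted `list_models()` helper). A short sketch of how it might be used; the model name is illustrative:

    from spacy.cli import info

    meta = info("en_core_web_sm", silent=True)   # meta dict for one installed model
    data = info(markdown=True)                   # print install info as Markdown for GitHub issues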
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import math
 | 
					import math
 | 
				
			||||||
from tqdm import tqdm
 | 
					from tqdm import tqdm
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
| 
						 | 
					@ -27,32 +23,18 @@ except ImportError:
 | 
				
			||||||
DEFAULT_OOV_PROB = -20
 | 
					DEFAULT_OOV_PROB = -20
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					 | 
				
			||||||
    lang=("Model language", "positional", None, str),
 | 
					 | 
				
			||||||
    output_dir=("Model output directory", "positional", None, Path),
 | 
					 | 
				
			||||||
    freqs_loc=("Location of words frequencies file", "option", "f", Path),
 | 
					 | 
				
			||||||
    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
 | 
					 | 
				
			||||||
    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
 | 
					 | 
				
			||||||
    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
 | 
					 | 
				
			||||||
    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
 | 
					 | 
				
			||||||
    vectors_name=(
 | 
					 | 
				
			||||||
        "Optional name for the word vectors, e.g. en_core_web_lg.vectors",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "vn",
 | 
					 | 
				
			||||||
        str,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    model_name=("Optional name for the model meta", "option", "mn", str),
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
def init_model(
 | 
					def init_model(
 | 
				
			||||||
    lang,
 | 
					    # fmt: off
 | 
				
			||||||
    output_dir,
 | 
					    lang: ("Model language", "positional", None, str),
 | 
				
			||||||
    freqs_loc=None,
 | 
					    output_dir: ("Model output directory", "positional", None, Path),
 | 
				
			||||||
    clusters_loc=None,
 | 
					    freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
 | 
				
			||||||
    jsonl_loc=None,
 | 
					    clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
 | 
				
			||||||
    vectors_loc=None,
 | 
					    jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
 | 
				
			||||||
    prune_vectors=-1,
 | 
					    vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
 | 
				
			||||||
    vectors_name=None,
 | 
					    prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
 | 
				
			||||||
    model_name=None,
 | 
					    vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
 | 
				
			||||||
 | 
					    model_name: ("Optional name for the model meta", "option", "mn", str) = None,
 | 
				
			||||||
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Create a new model from raw data, like word frequencies, Brown clusters
 | 
					    Create a new model from raw data, like word frequencies, Brown clusters
 | 
				
			||||||
| 
						 | 
					@ -91,8 +73,7 @@ def init_model(
 | 
				
			||||||
    vec_added = len(nlp.vocab.vectors)
 | 
					    vec_added = len(nlp.vocab.vectors)
 | 
				
			||||||
    lex_added = len(nlp.vocab)
 | 
					    lex_added = len(nlp.vocab)
 | 
				
			||||||
    msg.good(
 | 
					    msg.good(
 | 
				
			||||||
        "Sucessfully compiled vocab",
 | 
					        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
 | 
				
			||||||
        "{} entries, {} vectors".format(lex_added, vec_added),
 | 
					 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    if not output_dir.exists():
 | 
					    if not output_dir.exists():
 | 
				
			||||||
        output_dir.mkdir()
 | 
					        output_dir.mkdir()
 | 
				
			||||||
| 
						 | 
					@ -177,9 +158,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
 | 
				
			||||||
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
 | 
					                nlp.vocab.vectors.add(lex.orth, row=lex.rank)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        if vectors_loc:
 | 
					        if vectors_loc:
 | 
				
			||||||
            with msg.loading("Reading vectors from {}".format(vectors_loc)):
 | 
					            with msg.loading(f"Reading vectors from {vectors_loc}"):
 | 
				
			||||||
                vectors_data, vector_keys = read_vectors(vectors_loc)
 | 
					                vectors_data, vector_keys = read_vectors(vectors_loc)
 | 
				
			||||||
            msg.good("Loaded vectors from {}".format(vectors_loc))
 | 
					            msg.good(f"Loaded vectors from {vectors_loc}")
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            vectors_data, vector_keys = (None, None)
 | 
					            vectors_data, vector_keys = (None, None)
 | 
				
			||||||
        if vector_keys is not None:
 | 
					        if vector_keys is not None:
 | 
				
			||||||
| 
						 | 
					@ -190,7 +171,7 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
 | 
				
			||||||
        if vectors_data is not None:
 | 
					        if vectors_data is not None:
 | 
				
			||||||
            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
 | 
					            nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
 | 
				
			||||||
    if name is None:
 | 
					    if name is None:
 | 
				
			||||||
        nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
 | 
					        nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        nlp.vocab.vectors.name = name
 | 
					        nlp.vocab.vectors.name = name
 | 
				
			||||||
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
 | 
					    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
 | 
				
			||||||
| 
						 | 
					@ -236,7 +217,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
 | 
				
			||||||
                    word = literal_eval(key)
 | 
					                    word = literal_eval(key)
 | 
				
			||||||
                except SyntaxError:
 | 
					                except SyntaxError:
 | 
				
			||||||
                    # Take odd strings literally.
 | 
					                    # Take odd strings literally.
 | 
				
			||||||
                    word = literal_eval("'%s'" % key)
 | 
					                    word = literal_eval(f"'{key}'")
 | 
				
			||||||
                smooth_count = counts.smoother(int(freq))
 | 
					                smooth_count = counts.smoother(int(freq))
 | 
				
			||||||
                probs[word] = math.log(smooth_count) - log_total
 | 
					                probs[word] = math.log(smooth_count) - log_total
 | 
				
			||||||
    oov_prob = math.log(counts.smoother(0)) - log_total
 | 
					    oov_prob = math.log(counts.smoother(0)) - log_total
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
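As with `evaluate`, the `init_model` signature above folds the former `@plac.annotations` block into annotated keyword arguments between `# fmt: off`/`# fmt: on` markers. A sketch of a programmatic call; the paths are purely illustrative and real frequency/vector files are needed to run it:

    from pathlib import Path
    from spacy.cli import init_model

    init_model(
        "en",                                    # lang
        Path("./my-model"),                      # output_dir
        freqs_loc=Path("word_freqs.txt"),
        vectors_loc="vectors.word2vec.txt",
        prune_vectors=20000,
    )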
| 
						 | 
					@ -1,77 +0,0 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					 | 
				
			||||||
from wasabi import msg
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ..compat import symlink_to, path2str
 | 
					 | 
				
			||||||
from .. import util
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@plac.annotations(
 | 
					 | 
				
			||||||
    origin=("package name or local path to model", "positional", None, str),
 | 
					 | 
				
			||||||
    link_name=("name of shortuct link to create", "positional", None, str),
 | 
					 | 
				
			||||||
    force=("force overwriting of existing link", "flag", "f", bool),
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
def link(origin, link_name, force=False, model_path=None):
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    Create a symlink for models within the spacy/data directory. Accepts
 | 
					 | 
				
			||||||
    either the name of a pip package, or the local path to the model data
 | 
					 | 
				
			||||||
    directory. Linking models allows loading them via spacy.load(link_name).
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    if util.is_package(origin):
 | 
					 | 
				
			||||||
        model_path = util.get_package_path(origin)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        model_path = Path(origin) if model_path is None else Path(model_path)
 | 
					 | 
				
			||||||
    if not model_path.exists():
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Can't locate model data",
 | 
					 | 
				
			||||||
            "The data should be located in {}".format(path2str(model_path)),
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    data_path = util.get_data_path()
 | 
					 | 
				
			||||||
    if not data_path or not data_path.exists():
 | 
					 | 
				
			||||||
        spacy_loc = Path(__file__).parent.parent
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Can't find the spaCy data path to create model symlink",
 | 
					 | 
				
			||||||
            "Make sure a directory `/data` exists within your spaCy "
 | 
					 | 
				
			||||||
            "installation and try again. The data directory should be located "
 | 
					 | 
				
			||||||
            "here:".format(path=spacy_loc),
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    link_path = util.get_data_path() / link_name
 | 
					 | 
				
			||||||
    if link_path.is_symlink() and not force:
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Link '{}' already exists".format(link_name),
 | 
					 | 
				
			||||||
            "To overwrite an existing link, use the --force flag",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    elif link_path.is_symlink():  # does a symlink exist?
 | 
					 | 
				
			||||||
        # NB: It's important to check for is_symlink here and not for exists,
 | 
					 | 
				
			||||||
        # because invalid/outdated symlinks would return False otherwise.
 | 
					 | 
				
			||||||
        link_path.unlink()
 | 
					 | 
				
			||||||
    elif link_path.exists():  # does it exist otherwise?
 | 
					 | 
				
			||||||
        # NB: Check this last because valid symlinks also "exist".
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Can't overwrite symlink '{}'".format(link_name),
 | 
					 | 
				
			||||||
            "This can happen if your data directory contains a directory or "
 | 
					 | 
				
			||||||
            "file of the same name.",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        symlink_to(link_path, model_path)
 | 
					 | 
				
			||||||
    except:  # noqa: E722
 | 
					 | 
				
			||||||
        # This is quite dirty, but just making sure other errors are caught.
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Couldn't link model to '{}'".format(link_name),
 | 
					 | 
				
			||||||
            "Creating a symlink in spacy/data failed. Make sure you have the "
 | 
					 | 
				
			||||||
            "required permissions and try re-running the command as admin, or "
 | 
					 | 
				
			||||||
            "use a virtualenv. You can still import the model as a module and "
 | 
					 | 
				
			||||||
            "call its load() method, or create the symlink manually.",
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        msg.text(details)
 | 
					 | 
				
			||||||
        raise
 | 
					 | 
				
			||||||
    msg.good("Linking successful", details)
 | 
					 | 
				
			||||||
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
 | 
					 | 
				
			||||||
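The whole `link` command is deleted above: the symlink-based `spacy/data` shortcut mechanism is gone, so models are loaded directly by installed package name or filesystem path instead. For example (name and path illustrative):

    import spacy

    nlp = spacy.load("en_core_web_sm")        # installed package name
    nlp = spacy.load("/path/to/model-dir")    # or a path to a model directory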
| 
						 | 
					@ -1,25 +1,21 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from wasabi import msg, get_raw_input
 | 
					from wasabi import msg, get_raw_input
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..compat import path2str
 | 
					 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from .. import about
 | 
					from .. import about
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					def package(
 | 
				
			||||||
    input_dir=("Directory with model data", "positional", None, str),
 | 
					    # fmt: off
 | 
				
			||||||
    output_dir=("Output parent directory", "positional", None, str),
 | 
					    input_dir: ("Directory with model data", "positional", None, str),
 | 
				
			||||||
    meta_path=("Path to meta.json", "option", "m", str),
 | 
					    output_dir: ("Output parent directory", "positional", None, str),
 | 
				
			||||||
    create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
 | 
					    meta_path: ("Path to meta.json", "option", "m", str) = None,
 | 
				
			||||||
    force=("Force overwriting existing model in output directory", "flag", "f", bool),
 | 
					    create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
 | 
				
			||||||
)
 | 
					    force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
 | 
				
			||||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
 | 
					    # fmt: on
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Generate Python package for model data, including meta and required
 | 
					    Generate Python package for model data, including meta and required
 | 
				
			||||||
    installation files. A new directory will be created in the specified
 | 
					    installation files. A new directory will be created in the specified
 | 
				
			||||||
| 
						 | 
					@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
 | 
				
			||||||
    for key in ("lang", "name", "version"):
 | 
					    for key in ("lang", "name", "version"):
 | 
				
			||||||
        if key not in meta or meta[key] == "":
 | 
					        if key not in meta or meta[key] == "":
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                "No '{}' setting found in meta.json".format(key),
 | 
					                f"No '{key}' setting found in meta.json",
 | 
				
			||||||
                "This setting is required to build your package.",
 | 
					                "This setting is required to build your package.",
 | 
				
			||||||
                exits=1,
 | 
					                exits=1,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
| 
						 | 
					@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if package_path.exists():
 | 
					    if package_path.exists():
 | 
				
			||||||
        if force:
 | 
					        if force:
 | 
				
			||||||
            shutil.rmtree(path2str(package_path))
 | 
					            shutil.rmtree(str(package_path))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            msg.fail(
 | 
					            msg.fail(
 | 
				
			||||||
                "Package directory already exists",
 | 
					                "Package directory already exists",
 | 
				
			||||||
                "Please delete the directory and try again, or use the "
 | 
					                "Please delete the directory and try again, or use the "
 | 
				
			||||||
                "`--force` flag to overwrite existing "
 | 
					                "`--force` flag to overwrite existing directories.",
 | 
				
			||||||
                "directories.".format(path=path2str(package_path)),
 | 
					 | 
				
			||||||
                exits=1,
 | 
					                exits=1,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
    Path.mkdir(package_path, parents=True)
 | 
					    Path.mkdir(package_path, parents=True)
 | 
				
			||||||
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
 | 
					    shutil.copytree(str(input_path), str(package_path / model_name_v))
 | 
				
			||||||
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
 | 
					    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
 | 
				
			||||||
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
 | 
					    create_file(main_path / "setup.py", TEMPLATE_SETUP)
 | 
				
			||||||
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
 | 
					    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
 | 
				
			||||||
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
 | 
					    create_file(package_path / "__init__.py", TEMPLATE_INIT)
 | 
				
			||||||
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
 | 
					    msg.good(f"Successfully created package '{model_name_v}'", main_path)
 | 
				
			||||||
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
 | 
					    msg.text("To build the package, run `python setup.py sdist` in this directory.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -88,7 +83,7 @@ def generate_meta(model_path, existing_meta, msg):
 | 
				
			||||||
        ("lang", "Model language", meta.get("lang", "en")),
 | 
					        ("lang", "Model language", meta.get("lang", "en")),
 | 
				
			||||||
        ("name", "Model name", meta.get("name", "model")),
 | 
					        ("name", "Model name", meta.get("name", "model")),
 | 
				
			||||||
        ("version", "Model version", meta.get("version", "0.0.0")),
 | 
					        ("version", "Model version", meta.get("version", "0.0.0")),
 | 
				
			||||||
        ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
 | 
					        ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
 | 
				
			||||||
        ("description", "Model description", meta.get("description", False)),
 | 
					        ("description", "Model description", meta.get("description", False)),
 | 
				
			||||||
        ("author", "Author", meta.get("author", False)),
 | 
					        ("author", "Author", meta.get("author", False)),
 | 
				
			||||||
        ("email", "Author email", meta.get("email", False)),
 | 
					        ("email", "Author email", meta.get("email", False)),
 | 
				
			||||||
| 
						 | 
					@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TEMPLATE_SETUP = """
 | 
					TEMPLATE_SETUP = """
 | 
				
			||||||
#!/usr/bin/env python
 | 
					#!/usr/bin/env python
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import io
 | 
					import io
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
from os import path, walk
 | 
					from os import path, walk
 | 
				
			||||||
| 
						 | 
					@ -190,9 +182,6 @@ include meta.json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TEMPLATE_INIT = """
 | 
					TEMPLATE_INIT = """
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from spacy.util import load_model_from_init_py, get_model_meta
 | 
					from spacy.util import load_model_from_init_py, get_model_meta
 | 
				
			||||||
 | 
					
 | 
				
			||||||
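The `package` command above follows the same decorator-to-annotation conversion and replaces `path2str` with plain `str()`, consistent with dropping the Python 2 compat shims. A sketch of typical use, with illustrative directories; `input_dir` must contain a trained model with a `meta.json`:

    from spacy.cli import package

    package("./model-best", "./packages", force=True)
    # then, as the command itself suggests, run `python setup.py sdist`
    # inside the generated package directory to build the package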
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,107 +1,50 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import print_function, unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import numpy
 | 
					import numpy
 | 
				
			||||||
import time
 | 
					import time
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
from collections import Counter
 | 
					from collections import Counter
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from thinc.v2v import Affine, Maxout
 | 
					from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu
 | 
				
			||||||
from thinc.misc import LayerNorm as LN
 | 
					from thinc.api import CosineDistance, L2Distance
 | 
				
			||||||
from thinc.neural.util import prefer_gpu
 | 
					 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ..gold import Example
 | 
				
			||||||
from ..errors import Errors
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
from ..attrs import ID, HEAD
 | 
					from ..attrs import ID, HEAD
 | 
				
			||||||
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
 | 
					from ..ml.component_models import Tok2Vec
 | 
				
			||||||
from .._ml import masked_language_model, get_cossim_loss
 | 
					from ..ml.component_models import masked_language_model
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
 | 
					from ..util import create_default_optimizer
 | 
				
			||||||
from .train import _load_pretrained_tok2vec
 | 
					from .train import _load_pretrained_tok2vec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					 | 
				
			||||||
    texts_loc=(
 | 
					 | 
				
			||||||
        "Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
 | 
					 | 
				
			||||||
        "key 'tokens'",
 | 
					 | 
				
			||||||
        "positional",
 | 
					 | 
				
			||||||
        None,
 | 
					 | 
				
			||||||
        str,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    vectors_model=("Name or path to spaCy model with vectors to learn from"),
 | 
					 | 
				
			||||||
    output_dir=("Directory to write models to on each epoch", "positional", None, str),
 | 
					 | 
				
			||||||
    width=("Width of CNN layers", "option", "cw", int),
 | 
					 | 
				
			||||||
    depth=("Depth of CNN layers", "option", "cd", int),
 | 
					 | 
				
			||||||
    cnn_window=("Window size for CNN layers", "option", "cW", int),
 | 
					 | 
				
			||||||
    cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
 | 
					 | 
				
			||||||
    use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
 | 
					 | 
				
			||||||
    sa_depth=("Depth of self-attention layers", "option", "sa", int),
 | 
					 | 
				
			||||||
    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
 | 
					 | 
				
			||||||
    embed_rows=("Number of embedding rows", "option", "er", int),
 | 
					 | 
				
			||||||
    loss_func=(
 | 
					 | 
				
			||||||
        "Loss function to use for the objective. Either 'L2' or 'cosine'",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "L",
 | 
					 | 
				
			||||||
        str,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
 | 
					 | 
				
			||||||
    dropout=("Dropout rate", "option", "d", float),
 | 
					 | 
				
			||||||
    batch_size=("Number of words per training batch", "option", "bs", int),
 | 
					 | 
				
			||||||
    max_length=(
 | 
					 | 
				
			||||||
        "Max words per example. Longer examples are discarded",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "xw",
 | 
					 | 
				
			||||||
        int,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    min_length=(
 | 
					 | 
				
			||||||
        "Min words per example. Shorter examples are discarded",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "nw",
 | 
					 | 
				
			||||||
        int,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    seed=("Seed for random number generators", "option", "s", int),
 | 
					 | 
				
			||||||
    n_iter=("Number of iterations to pretrain", "option", "i", int),
 | 
					 | 
				
			||||||
    n_save_every=("Save model every X batches.", "option", "se", int),
 | 
					 | 
				
			||||||
    init_tok2vec=(
 | 
					 | 
				
			||||||
        "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "t2v",
 | 
					 | 
				
			||||||
        Path,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
    epoch_start=(
 | 
					 | 
				
			||||||
        "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
 | 
					 | 
				
			||||||
        "renamed. Prevents unintended overwriting of existing weight files.",
 | 
					 | 
				
			||||||
        "option",
 | 
					 | 
				
			||||||
        "es",
 | 
					 | 
				
			||||||
        int,
 | 
					 | 
				
			||||||
    ),
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
def pretrain(
 | 
					def pretrain(
 | 
				
			||||||
    texts_loc,
 | 
					    # fmt: off
 | 
				
			||||||
    vectors_model,
 | 
					    texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
 | 
				
			||||||
    output_dir,
 | 
					    vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str),
 | 
				
			||||||
    width=96,
 | 
					    output_dir: ("Directory to write models to on each epoch", "positional", None, str),
 | 
				
			||||||
    depth=4,
 | 
					    width: ("Width of CNN layers", "option", "cw", int) = 96,
 | 
				
			||||||
    bilstm_depth=0,
 | 
					    conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4,
 | 
				
			||||||
    cnn_pieces=3,
 | 
					    bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0,
 | 
				
			||||||
    sa_depth=0,
 | 
					    cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3,
 | 
				
			||||||
    use_chars=False,
 | 
					    sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0,
 | 
				
			||||||
    cnn_window=1,
 | 
					    use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False,
 | 
				
			||||||
    embed_rows=2000,
 | 
					    cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1,
 | 
				
			||||||
    loss_func="cosine",
 | 
					    embed_rows: ("Number of embedding rows", "option", "er", int) = 2000,
 | 
				
			||||||
    use_vectors=False,
 | 
					    loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine",
 | 
				
			||||||
    dropout=0.2,
 | 
					    use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False,
 | 
				
			||||||
    n_iter=1000,
 | 
					    dropout: ("Dropout rate", "option", "d", float) = 0.2,
 | 
				
			||||||
    batch_size=3000,
 | 
					    n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000,
 | 
				
			||||||
    max_length=500,
 | 
					    batch_size: ("Number of words per training batch", "option", "bs", int) = 3000,
 | 
				
			||||||
    min_length=5,
 | 
					    max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500,
 | 
				
			||||||
    seed=0,
 | 
					    min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5,
 | 
				
			||||||
    n_save_every=None,
 | 
					    seed: ("Seed for random number generators", "option", "s", int) = 0,
 | 
				
			||||||
    init_tok2vec=None,
 | 
					    n_save_every: ("Save model every X batches.", "option", "se", int) = None,
 | 
				
			||||||
    epoch_start=None,
 | 
					    init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
 | 
				
			||||||
 | 
					    epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None,
 | 
				
			||||||
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
 | 
					    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
 | 
				
			||||||
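The `pretrain` signature above also renames the CNN depth argument from `depth` to `conv_depth` (the `cd` option). A sketch of a programmatic call with illustrative paths; real raw-text JSONL and a vectors model are required for it to run:

    from spacy.cli import pretrain

    pretrain(
        "raw_texts.jsonl",      # texts_loc: one JSON object per line with "text" or "tokens"
        "en_core_web_md",       # vectors_model
        "./pretrain-output",    # output_dir
        conv_depth=4,
        use_chars=False,
        n_iter=100,
    )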
| 
						 | 
					@ -132,9 +75,15 @@ def pretrain(
 | 
				
			||||||
    msg.info("Using GPU" if has_gpu else "Not using GPU")
 | 
					    msg.info("Using GPU" if has_gpu else "Not using GPU")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    output_dir = Path(output_dir)
 | 
					    output_dir = Path(output_dir)
 | 
				
			||||||
 | 
					    if output_dir.exists() and [p for p in output_dir.iterdir()]:
 | 
				
			||||||
 | 
					        msg.warn(
 | 
				
			||||||
 | 
					            "Output directory is not empty",
 | 
				
			||||||
 | 
					            "It is better to use an empty directory or refer to a new output path, "
 | 
				
			||||||
 | 
					            "then the new directory will be created for you.",
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
    if not output_dir.exists():
 | 
					    if not output_dir.exists():
 | 
				
			||||||
        output_dir.mkdir()
 | 
					        output_dir.mkdir()
 | 
				
			||||||
        msg.good("Created output directory")
 | 
					        msg.good(f"Created output directory: {output_dir}")
 | 
				
			||||||
    srsly.write_json(output_dir / "config.json", config)
 | 
					    srsly.write_json(output_dir / "config.json", config)
 | 
				
			||||||
    msg.good("Saved settings to config.json")
 | 
					    msg.good("Saved settings to config.json")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -153,16 +102,16 @@ def pretrain(
 | 
				
			||||||
        msg.text("Reading input text from stdin...")
 | 
					        msg.text("Reading input text from stdin...")
 | 
				
			||||||
        texts = srsly.read_jsonl("-")
 | 
					        texts = srsly.read_jsonl("-")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with msg.loading("Loading model '{}'...".format(vectors_model)):
 | 
					    with msg.loading(f"Loading model '{vectors_model}'..."):
 | 
				
			||||||
        nlp = util.load_model(vectors_model)
 | 
					        nlp = util.load_model(vectors_model)
 | 
				
			||||||
    msg.good("Loaded model '{}'".format(vectors_model))
 | 
					    msg.good(f"Loaded model '{vectors_model}'")
 | 
				
			||||||
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
 | 
					    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors
 | 
				
			||||||
    model = create_pretraining_model(
 | 
					    model = create_pretraining_model(
 | 
				
			||||||
        nlp,
 | 
					        nlp,
 | 
				
			||||||
        Tok2Vec(
 | 
					        Tok2Vec(
 | 
				
			||||||
            width,
 | 
					            width,
 | 
				
			||||||
            embed_rows,
 | 
					            embed_rows,
 | 
				
			||||||
            conv_depth=depth,
 | 
					            conv_depth=conv_depth,
 | 
				
			||||||
            pretrained_vectors=pretrained_vectors,
 | 
					            pretrained_vectors=pretrained_vectors,
 | 
				
			||||||
            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
 | 
					            bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
 | 
				
			||||||
            subword_features=not use_chars,  # Set to False for Chinese etc
 | 
					            subword_features=not use_chars,  # Set to False for Chinese etc
 | 
				
			||||||
| 
						 | 
					@ -172,7 +121,7 @@ def pretrain(
 | 
				
			||||||
    # Load in pretrained weights
 | 
					    # Load in pretrained weights
 | 
				
			||||||
    if init_tok2vec is not None:
 | 
					    if init_tok2vec is not None:
 | 
				
			||||||
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
 | 
					        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
 | 
				
			||||||
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
 | 
					        msg.text(f"Loaded pretrained tok2vec for: {components}")
 | 
				
			||||||
        # Parse the epoch number from the given weight file
 | 
					        # Parse the epoch number from the given weight file
 | 
				
			||||||
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
 | 
					        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
 | 
				
			||||||
        if model_name:
 | 
					        if model_name:
 | 
				
			||||||
| 
						 | 
					@ -181,32 +130,28 @@ def pretrain(
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            if not epoch_start:
 | 
					            if not epoch_start:
 | 
				
			||||||
                msg.fail(
 | 
					                msg.fail(
 | 
				
			||||||
                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
 | 
					                    "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec",
 | 
				
			||||||
                    "'--init-tok2vec'",
 | 
					 | 
				
			||||||
                    exits=True,
 | 
					                    exits=True,
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
            elif epoch_start < 0:
 | 
					            elif epoch_start < 0:
 | 
				
			||||||
                msg.fail(
 | 
					                msg.fail(
 | 
				
			||||||
                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
 | 
					                    f"The argument --epoch-start has to be greater or equal to 0. {epoch_start} is invalid",
 | 
				
			||||||
                    % epoch_start,
 | 
					 | 
				
			||||||
                    exits=True,
 | 
					                    exits=True,
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
 | 
					        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
 | 
				
			||||||
        epoch_start = 0
 | 
					        epoch_start = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    optimizer = create_default_optimizer(model.ops)
 | 
					    optimizer = create_default_optimizer()
 | 
				
			||||||
    tracker = ProgressTracker(frequency=10000)
 | 
					    tracker = ProgressTracker(frequency=10000)
 | 
				
			||||||
    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
 | 
					    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}")
 | 
				
			||||||
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
 | 
					    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
 | 
				
			||||||
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 | 
					    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _save_model(epoch, is_temp=False):
 | 
					    def _save_model(epoch, is_temp=False):
 | 
				
			||||||
        is_temp_str = ".temp" if is_temp else ""
 | 
					        is_temp_str = ".temp" if is_temp else ""
 | 
				
			||||||
        with model.use_params(optimizer.averages):
 | 
					        with model.use_params(optimizer.averages):
 | 
				
			||||||
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
 | 
					            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
 | 
				
			||||||
                "wb"
 | 
					 | 
				
			||||||
            ) as file_:
 | 
					 | 
				
			||||||
                file_.write(model.tok2vec.to_bytes())
 | 
					                file_.write(model.tok2vec.to_bytes())
 | 
				
			||||||
            log = {
 | 
					            log = {
 | 
				
			||||||
                "nr_word": tracker.nr_word,
 | 
					                "nr_word": tracker.nr_word,
 | 
				
			||||||
| 
						 | 
					@ -220,7 +165,9 @@ def pretrain(
 | 
				
			||||||
    skip_counter = 0
 | 
					    skip_counter = 0
 | 
				
			||||||
    for epoch in range(epoch_start, n_iter + epoch_start):
 | 
					    for epoch in range(epoch_start, n_iter + epoch_start):
 | 
				
			||||||
        for batch_id, batch in enumerate(
 | 
					        for batch_id, batch in enumerate(
 | 
				
			||||||
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
 | 
					            util.minibatch_by_words(
 | 
				
			||||||
 | 
					                (Example(doc=text) for text in texts), size=batch_size
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
        ):
 | 
					        ):
 | 
				
			||||||
            docs, count = make_docs(
 | 
					            docs, count = make_docs(
 | 
				
			||||||
                nlp,
 | 
					                nlp,
 | 
				
			||||||
| 
						 | 
					@ -245,7 +192,7 @@ def pretrain(
 | 
				
			||||||
            # Reshuffle the texts if texts were loaded from a file
 | 
					            # Reshuffle the texts if texts were loaded from a file
 | 
				
			||||||
            random.shuffle(texts)
 | 
					            random.shuffle(texts)
 | 
				
			||||||
    if skip_counter > 0:
 | 
					    if skip_counter > 0:
 | 
				
			||||||
        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
 | 
					        msg.warn(f"Skipped {skip_counter} empty values")
 | 
				
			||||||
    msg.good("Successfully finished pretrain")
 | 
					    msg.good("Successfully finished pretrain")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -310,13 +257,14 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
 | 
				
			||||||
    # and look them up all at once. This prevents data copying.
 | 
					    # and look them up all at once. This prevents data copying.
 | 
				
			||||||
    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
 | 
					    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
 | 
				
			||||||
    target = docs[0].vocab.vectors.data[ids]
 | 
					    target = docs[0].vocab.vectors.data[ids]
 | 
				
			||||||
 | 
					    # TODO: this code originally didn't normalize, but shouldn't it use normalize=True?
 | 
				
			||||||
    if objective == "L2":
 | 
					    if objective == "L2":
 | 
				
			||||||
        d_target = prediction - target
 | 
					        distance = L2Distance(normalize=False)
 | 
				
			||||||
        loss = (d_target ** 2).sum()
 | 
					 | 
				
			||||||
    elif objective == "cosine":
 | 
					    elif objective == "cosine":
 | 
				
			||||||
        loss, d_target = get_cossim_loss(prediction, target)
 | 
					        distance = CosineDistance(normalize=False)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        raise ValueError(Errors.E142.format(loss_func=objective))
 | 
					        raise ValueError(Errors.E142.format(loss_func=objective))
 | 
				
			||||||
 | 
					    d_target, loss = distance(prediction, target)
 | 
				
			||||||
    return loss, d_target
 | 
					    return loss, d_target
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
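The change above replaces the hand-written L2 and cosine gradients with Thinc's loss objects; note that both are constructed with `normalize=False`, matching the old behaviour (hence the TODO). A minimal, self-contained sketch of that calling pattern, assuming the Thinc v8-style API used here; the array shapes are illustrative:

    import numpy
    from thinc.api import CosineDistance, L2Distance

    prediction = numpy.random.rand(4, 300).astype("float32")
    target = numpy.random.rand(4, 300).astype("float32")

    distance = CosineDistance(normalize=False)      # or L2Distance(normalize=False)
    d_target, loss = distance(prediction, target)   # gradient w.r.t. prediction, scalar loss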
| 
						 | 
					@ -328,18 +276,18 @@ def create_pretraining_model(nlp, tok2vec):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    output_size = nlp.vocab.vectors.data.shape[1]
 | 
					    output_size = nlp.vocab.vectors.data.shape[1]
 | 
				
			||||||
    output_layer = chain(
 | 
					    output_layer = chain(
 | 
				
			||||||
        LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
 | 
					        Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size)
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    # This is annoying, but the parser etc have the flatten step after
 | 
					    # This is annoying, but the parser etc have the flatten step after
 | 
				
			||||||
    # the tok2vec. To load the weights in cleanly, we need to match
 | 
					    # the tok2vec. To load the weights in cleanly, we need to match
 | 
				
			||||||
    # the shape of the models' components exactly. So what we call
 | 
					    # the shape of the models' components exactly. So what we call
 | 
				
			||||||
    # "tok2vec" has to be the same set of processes as what the components do.
 | 
					    # "tok2vec" has to be the same set of processes as what the components do.
 | 
				
			||||||
    tok2vec = chain(tok2vec, flatten)
 | 
					    tok2vec = chain(tok2vec, list2array())
 | 
				
			||||||
    model = chain(tok2vec, output_layer)
 | 
					    model = chain(tok2vec, output_layer)
 | 
				
			||||||
    model = masked_language_model(nlp.vocab, model)
 | 
					    model = masked_language_model(nlp.vocab, model)
 | 
				
			||||||
    model.tok2vec = tok2vec
 | 
					    model.set_ref("tok2vec", tok2vec)
 | 
				
			||||||
    model.output_layer = output_layer
 | 
					    model.set_ref("output_layer", output_layer)
 | 
				
			||||||
    model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
 | 
					    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
 | 
				
			||||||
    return model
 | 
					    return model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
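The rewrite above reflects the general Thinc v8 composition pattern: build layers, combine them with `chain`, register sub-layers with `set_ref` (instead of attaching them as attributes), and call `initialize` with sample data to infer shapes. A small standalone sketch of that pattern, with illustrative layer sizes rather than the ones used here:

    import numpy
    from thinc.api import chain, Maxout, Linear

    tok2vec = Maxout(nO=96)                      # stand-in for the real tok2vec layer
    output_layer = Linear(nO=300)
    model = chain(tok2vec, output_layer)
    model.set_ref("tok2vec", tok2vec)            # named references instead of attributes
    model.set_ref("output_layer", output_layer)

    X = numpy.zeros((2, 50), dtype="f")          # sample input used to infer dimensions
    model.initialize(X=X)
    assert model.get_ref("tok2vec") is tok2vec   # retrieve a named sub-layer later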
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals, division, print_function
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import plac
 | 
					 | 
				
			||||||
import tqdm
 | 
					import tqdm
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
| 
						 | 
					@ -9,18 +5,19 @@ import cProfile
 | 
				
			||||||
import pstats
 | 
					import pstats
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import itertools
 | 
					import itertools
 | 
				
			||||||
import thinc.extra.datasets
 | 
					import ml_datasets
 | 
				
			||||||
from wasabi import msg
 | 
					from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..util import load_model
 | 
					from ..util import load_model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@plac.annotations(
 | 
					def profile(
 | 
				
			||||||
    model=("Model to load", "positional", None, str),
 | 
					    # fmt: off
 | 
				
			||||||
    inputs=("Location of input file. '-' for stdin.", "positional", None, str),
 | 
					    model: ("Model to load", "positional", None, str),
 | 
				
			||||||
    n_texts=("Maximum number of texts to use if available", "option", "n", int),
 | 
					    inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
 | 
				
			||||||
)
 | 
					    n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
 | 
				
			||||||
def profile(model, inputs=None, n_texts=10000):
 | 
					    # fmt: on
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
					    Profile a spaCy pipeline, to find out which functions take the most time.
 | 
				
			||||||
    Input should be formatted as one JSON object per line with a key "text".
 | 
					    Input should be formatted as one JSON object per line with a key "text".
 | 
				
			||||||
| 
						 | 
@@ -32,13 +29,13 @@ def profile(model, inputs=None, n_texts=10000):
     if inputs is None:
         n_inputs = 25000
         with msg.loading("Loading IMDB dataset via Thinc..."):
-            imdb_train, _ = thinc.extra.datasets.imdb()
+            imdb_train, _ = ml_datasets.imdb()
             inputs, _ = zip(*imdb_train)
-        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
+        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
         inputs = inputs[:n_inputs]
-    with msg.loading("Loading model '{}'...".format(model)):
+    with msg.loading(f"Loading model '{model}'..."):
         nlp = load_model(model)
-    msg.good("Loaded model '{}'".format(model))
+    msg.good(f"Loaded model '{model}'")
     texts = list(itertools.islice(inputs, n_texts))
     cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
     s = pstats.Stats("Profile.prof")
@@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
         input_path = Path(loc)
         if not input_path.exists() or not input_path.is_file():
             msg.fail("Not a valid input data file", loc, exits=1)
-        msg.info("Using data from {}".format(input_path.parts[-1]))
+        msg.info(f"Using data from {input_path.parts[-1]}")
         file_ = input_path.open()
     for line in file_:
         data = srsly.json_loads(line)
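The profile command is built on the standard library's cProfile/pstats pair; a self-contained sketch (not from this commit) of that pattern, with work() standing in for parse_texts(nlp, texts):

    import cProfile
    import pstats


    def work(texts):
        return [len(t.split()) for t in texts]  # stand-in for parse_texts(nlp, texts)


    texts = ["profile me", "and me too"] * 1000
    cProfile.runctx("work(texts)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs().sort_stats("cumulative").print_stats(10)  # top 10 entries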
@@ -1,11 +1,7 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
-import plac
 import os
 import tqdm
 from pathlib import Path
-from thinc.neural._classes.model import Model
+from thinc.api import use_ops
 from timeit import default_timer as timer
 import shutil
 import srsly
@@ -13,76 +9,53 @@ from wasabi import msg
 import contextlib
 import random
 
-from .._ml import create_default_optimizer
+from ..util import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
-from ..compat import path2str
 from .. import util
 from .. import about
 
 
-@plac.annotations(
-    # fmt: off
-    lang=("Model language", "positional", None, str),
-    output_path=("Output directory to store model in", "positional", None, Path),
-    train_path=("Location of JSON-formatted training data", "positional", None, Path),
-    dev_path=("Location of JSON-formatted development data", "positional", None, Path),
-    raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
-    base_model=("Name of model to update (optional)", "option", "b", str),
-    pipeline=("Comma-separated names of pipeline components", "option", "p", str),
-    replace_components=("Replace components from base model", "flag", "R", bool),
-    vectors=("Model to load vectors from", "option", "v", str),
-    n_iter=("Number of iterations", "option", "n", int),
-    n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
-    n_examples=("Number of examples", "option", "ns", int),
-    use_gpu=("Use GPU", "option", "g", int),
-    version=("Model version", "option", "V", str),
-    meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
-    init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
-    parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
-    entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
-    noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
-    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
-    eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
-    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
-    learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
-    textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
-    textcat_arch=("Textcat model architecture", "option", "ta", str),
-    textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
-    verbose=("Display more information for debug", "flag", "VV", bool),
-    debug=("Run data diagnostics before training", "flag", "D", bool),
-    # fmt: on
-)
 def train(
-    lang,
-    output_path,
-    train_path,
-    dev_path,
-    raw_text=None,
-    base_model=None,
-    pipeline="tagger,parser,ner",
-    replace_components=False,
-    vectors=None,
-    n_iter=30,
-    n_early_stopping=None,
-    n_examples=0,
-    use_gpu=-1,
-    version="0.0.0",
-    meta_path=None,
-    init_tok2vec=None,
-    parser_multitasks="",
-    entity_multitasks="",
-    noise_level=0.0,
-    orth_variant_level=0.0,
-    eval_beam_widths="",
-    gold_preproc=False,
-    learn_tokens=False,
-    textcat_multilabel=False,
-    textcat_arch="bow",
-    textcat_positive_label=None,
-    verbose=False,
-    debug=False,
+    # fmt: off
+    lang: ("Model language", "positional", None, str),
+    output_path: ("Output directory to store model in", "positional", None, Path),
+    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
+    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
+    raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
+    base_model: ("Name of model to update (optional)", "option", "b", str) = None,
+    pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner",
+    vectors: ("Model to load vectors from", "option", "v", str) = None,
+    replace_components: ("Replace components from base model", "flag", "R", bool) = False,
+    width: ("Width of CNN layers of Tok2Vec component", "option", "cw", int) = 96,
+    conv_depth: ("Depth of CNN layers of Tok2Vec component", "option", "cd", int) = 4,
+    cnn_window: ("Window size for CNN layers of Tok2Vec component", "option", "cW", int) = 1,
+    cnn_pieces: ("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int) = 3,
+    use_chars: ("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool) = False,
+    bilstm_depth: ("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int) = 0,
+    embed_rows: ("Number of embedding rows of Tok2Vec component", "option", "er", int) = 2000,
+    n_iter: ("Number of iterations", "option", "n", int) = 30,
+    n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None,
+    n_examples: ("Number of examples", "option", "ns", int) = 0,
+    use_gpu: ("Use GPU", "option", "g", int) = -1,
+    version: ("Model version", "option", "V", str) = "0.0.0",
+    meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None,
+    init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
+    parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "",
+    entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "",
+    noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0,
+    orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0,
+    eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "",
+    gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
+    learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False,
+    textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False,
+    textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow",
+    textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None,
+    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
+    verbose: ("Display more information for debug", "flag", "VV", bool) = False,
+    debug: ("Run data diagnostics before training", "flag", "D", bool) = False,
+    # fmt: on
 ):
     """
     Train or update a spaCy model. Requires data to be formatted in spaCy's
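Purely illustrative, assuming the signature above: the new Tok2Vec hyperparameters can be passed as keyword arguments when calling train() from Python. The paths and values below are placeholders, not part of the commit.

    from pathlib import Path
    from spacy.cli import train

    train(
        "en",
        Path("./output"),          # placeholder output directory
        Path("./train.json"),      # placeholder training data
        Path("./dev.json"),        # placeholder development data
        pipeline="tagger,parser",
        width=128,                 # corresponds to -cw
        conv_depth=4,              # corresponds to -cd
        use_chars=True,            # corresponds to -chr
        n_iter=10,
    )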
@@ -116,7 +89,11 @@ def train(
         )
     if not output_path.exists():
         output_path.mkdir()
+        msg.good(f"Created output directory: {output_path}")
+
+    tag_map = {}
+    if tag_map_path is not None:
+        tag_map = srsly.read_json(tag_map_path)
     # Take dropout and batch size as generators of values -- dropout
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
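A sketch (not from this commit) of what "dropout and batch size as generators" usually looks like with spaCy's schedule helpers; the start, stop and rate values here are made up for illustration.

    from spacy import util

    dropout_rates = util.decaying(0.6, 0.2, 1e-4)     # starts high, decays towards 0.2
    batch_sizes = util.compounding(1.0, 32.0, 1.001)  # starts at 1 and grows each step

    print(next(dropout_rates))  # 0.6
    print(next(batch_sizes))    # 1.0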
@@ -145,28 +122,29 @@ def train(
     # the model and make sure the pipeline matches the pipeline setting. If
     # training starts from a blank model, intitalize the language class.
     pipeline = [p.strip() for p in pipeline.split(",")]
+    msg.text(f"Training pipeline: {pipeline}")
     disabled_pipes = None
     pipes_added = False
-    msg.text("Training pipeline: {}".format(pipeline))
+    msg.text(f"Training pipeline: {pipeline}")
     if use_gpu >= 0:
         activated_gpu = None
         try:
             activated_gpu = set_gpu(use_gpu)
         except Exception as e:
-            msg.warn("Exception: {}".format(e))
+            msg.warn(f"Exception: {e}")
         if activated_gpu is not None:
-            msg.text("Using GPU: {}".format(use_gpu))
+            msg.text(f"Using GPU: {use_gpu}")
         else:
-            msg.warn("Unable to activate GPU: {}".format(use_gpu))
+            msg.warn(f"Unable to activate GPU: {use_gpu}")
             msg.text("Using CPU only")
             use_gpu = -1
     if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
         nlp = util.load_model(base_model)
         if nlp.lang != lang:
             msg.fail(
-                "Model language ('{}') doesn't match language specified as "
-                "`lang` argument ('{}') ".format(nlp.lang, lang),
+                f"Model language ('{nlp.lang}') doesn't match language "
+                f"specified as `lang` argument ('{lang}') ",
                 exits=1,
             )
         for pipe in pipeline:
@@ -180,11 +158,11 @@ def train(
                     "positive_label": textcat_positive_label,
                 }
             if pipe not in nlp.pipe_names:
-                msg.text("Adding component to base model '{}'".format(pipe))
+                msg.text(f"Adding component to base model '{pipe}'")
                 nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                 pipes_added = True
             elif replace_components:
-                msg.text("Replacing component from base model '{}'".format(pipe))
+                msg.text(f"Replacing component from base model '{pipe}'")
                 nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                 pipes_added = True
             else:
@@ -197,17 +175,17 @@ def train(
                     }
                     if base_cfg != pipe_cfg:
                         msg.fail(
-                            "The base textcat model configuration does"
-                            "not match the provided training options. "
-                            "Existing cfg: {}, provided cfg: {}".format(
-                                base_cfg, pipe_cfg
-                            ),
+                            f"The base textcat model configuration does"
+                            f"not match the provided training options. "
+                            f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
                             exits=1,
                         )
-                msg.text("Extending component from base model '{}'".format(pipe))
-        disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
+                msg.text(f"Extending component from base model '{pipe}'")
+        disabled_pipes = nlp.disable_pipes(
+            [p for p in nlp.pipe_names if p not in pipeline]
+        )
     else:
-        msg.text("Starting with blank model '{}'".format(lang))
+        msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
         for pipe in pipeline:
@@ -223,8 +201,11 @@ def train(
                 pipe_cfg = {}
             nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
 
+    # Update tag map with provided mapping
+    nlp.vocab.morphology.tag_map.update(tag_map)
+
     if vectors:
-        msg.text("Loading vector from model '{}'".format(vectors))
+        msg.text(f"Loading vector from model '{vectors}'")
         _load_vectors(nlp, vectors)
 
     # Multitask objectives
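A sketch (not from this commit) of the JSON tag map the new -tm/tag_map_path option expects -- a mapping from fine-grained tag to attribute dict -- and how it would be read back; the single entry is invented for illustration.

    import srsly

    tag_map = {"NNS": {"pos": "NOUN", "Number": "plur"}}   # made-up single entry
    srsly.write_json("tag_map.json", tag_map)

    loaded = srsly.read_json("tag_map.json")
    # nlp.vocab.morphology.tag_map.update(loaded)   # as in the hunk above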
@@ -233,49 +214,56 @@ def train(
         if multitasks:
             if pipe_name not in pipeline:
                 msg.fail(
-                    "Can't use multitask objective without '{}' in the "
-                    "pipeline".format(pipe_name)
+                    f"Can't use multitask objective without '{pipe_name}' in "
+                    f"the pipeline"
                 )
             pipe = nlp.get_pipe(pipe_name)
             for objective in multitasks.split(","):
                 pipe.add_multitask_objective(objective)
 
     # Prepare training corpus
-    msg.text("Counting training words (limit={})".format(n_examples))
+    msg.text(f"Counting training words (limit={n_examples})")
     corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
     n_train_words = corpus.count_train()
 
     if base_model and not pipes_added:
         # Start with an existing model, use default optimizer
-        optimizer = create_default_optimizer(Model.ops)
+        optimizer = create_default_optimizer()
     else:
         # Start with a blank model, call begin_training
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+        cfg = {"device": use_gpu}
+        cfg["conv_depth"] = conv_depth
+        cfg["token_vector_width"] = width
+        cfg["bilstm_depth"] = bilstm_depth
+        cfg["cnn_maxout_pieces"] = cnn_pieces
+        cfg["embed_size"] = embed_rows
+        cfg["conv_window"] = cnn_window
+        cfg["subword_features"] = not use_chars
+        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)
     nlp._optimizer = None
 
     # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")
 
     # Verify textcat config
     if "textcat" in pipeline:
         textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
         if textcat_positive_label and textcat_positive_label not in textcat_labels:
             msg.fail(
-                "The textcat_positive_label (tpl) '{}' does not match any "
-                "label in the training data.".format(textcat_positive_label),
+                f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
+                f"does not match any label in the training data.",
                 exits=1,
             )
         if textcat_positive_label and len(textcat_labels) != 2:
             msg.fail(
-                "A textcat_positive_label (tpl) '{}' was provided for training "
-                "data that does not appear to be a binary classification "
-                "problem with two labels.".format(textcat_positive_label),
+                "A textcat_positive_label (tpl) '{textcat_positive_label}' was "
+                "provided for training data that does not appear to be a "
+                "binary classification problem with two labels.",
                 exits=1,
             )
-        train_docs = corpus.train_docs(
+        train_data = corpus.train_data(
             nlp,
             noise_level=noise_level,
             gold_preproc=gold_preproc,
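A recap of the hunk above, as a plain dict: how the new CLI options map onto the keys passed to nlp.begin_training(). The values shown are the CLI defaults from the signature earlier in this diff.

    cfg = {
        "device": -1,                  # use_gpu
        "token_vector_width": 96,      # width (-cw)
        "conv_depth": 4,               # conv_depth (-cd)
        "conv_window": 1,              # cnn_window (-cW)
        "cnn_maxout_pieces": 3,        # cnn_pieces (-cP)
        "embed_size": 2000,            # embed_rows (-er)
        "bilstm_depth": 0,             # bilstm_depth (-lstm)
        "subword_features": True,      # not use_chars (-chr)
    }
    # optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)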
@@ -285,9 +273,9 @@ def train(
         train_labels = set()
         if textcat_multilabel:
             multilabel_found = False
-            for text, gold in train_docs:
-                train_labels.update(gold.cats.keys())
-                if list(gold.cats.values()).count(1.0) != 1:
+            for ex in train_data:
+                train_labels.update(ex.gold.cats.keys())
+                if list(ex.gold.cats.values()).count(1.0) != 1:
                     multilabel_found = True
             if not multilabel_found and not base_model:
                 msg.warn(
@@ -297,9 +285,9 @@ def train(
                     "mutually-exclusive classes."
                 )
         if not textcat_multilabel:
-            for text, gold in train_docs:
-                train_labels.update(gold.cats.keys())
-                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
+            for ex in train_data:
+                train_labels.update(ex.gold.cats.keys())
+                if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
                     msg.warn(
                         "Some textcat training instances do not have exactly "
                         "one positive label. Modifying training options to "
@@ -311,20 +299,20 @@ def train(
                     break
         if base_model and set(textcat_labels) != train_labels:
             msg.fail(
-                "Cannot extend textcat model using data with different "
-                "labels. Base model labels: {}, training data labels: "
-                "{}.".format(textcat_labels, list(train_labels)),
+                f"Cannot extend textcat model using data with different "
+                f"labels. Base model labels: {textcat_labels}, training data "
+                f"labels: {list(train_labels)}",
                 exits=1,
             )
         if textcat_multilabel:
             msg.text(
-                "Textcat evaluation score: ROC AUC score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: ROC AUC score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
             )
         elif textcat_positive_label and len(textcat_labels) == 2:
             msg.text(
-                "Textcat evaluation score: F1-score for the "
-                "label '{}'".format(textcat_positive_label)
+                f"Textcat evaluation score: F1-score for the "
+                f"label '{textcat_positive_label}'"
             )
         elif len(textcat_labels) > 1:
             if len(textcat_labels) == 2:
@@ -334,8 +322,8 @@ def train(
                     "an evaluation on the positive class."
                 )
             msg.text(
-                "Textcat evaluation score: F1-score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: F1-score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
             )
         else:
             msg.fail(
@@ -355,7 +343,7 @@ def train(
         iter_since_best = 0
         best_score = 0.0
         for i in range(n_iter):
-            train_docs = corpus.train_docs(
+            train_data = corpus.train_dataset(
                 nlp,
                 noise_level=noise_level,
                 orth_variant_level=orth_variant_level,
@@ -371,10 +359,11 @@ def train(
             words_seen = 0
             with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                 losses = {}
-                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
+                for batch in util.minibatch_by_words(train_data, size=batch_sizes):
                     if not batch:
                         continue
                     docs, golds = zip(*batch)
+                    try:
                         nlp.update(
                             docs,
                             golds,
@@ -382,62 +371,70 @@ def train(
                             drop=next(dropout_rates),
                             losses=losses,
                         )
+                    except ValueError as e:
+                        msg.warn("Error during training")
+                        if init_tok2vec:
+                            msg.warn(
+                                "Did you provide the same parameters during 'train' as during 'pretrain'?"
+                            )
+                        msg.fail(f"Original error message: {e}", exits=1)
                     if raw_text:
                         # If raw text is available, perform 'rehearsal' updates,
                         # which use unlabelled data to reduce overfitting.
                         raw_batch = list(next(raw_batches))
                         nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
+                    docs = [ex.doc for ex in batch]
                     if not int(os.environ.get("LOG_FRIENDLY", 0)):
                         pbar.update(sum(len(doc) for doc in docs))
                     words_seen += sum(len(doc) for doc in docs)
             with nlp.use_params(optimizer.averages):
                 util.set_env_log(False)
-                epoch_model_path = output_path / ("model%d" % i)
+                epoch_model_path = output_path / f"model{i}"
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = util.load_model_from_path(epoch_model_path)
                 for beam_width in eval_beam_widths:
                     for name, component in nlp_loaded.pipeline:
                         if hasattr(component, "cfg"):
                             component.cfg["beam_width"] = beam_width
-                    dev_docs = list(
-                        corpus.dev_docs(
+                    dev_dataset = list(
+                        corpus.dev_dataset(
                             nlp_loaded,
                             gold_preproc=gold_preproc,
                             ignore_misaligned=True,
                         )
                     )
-                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+                    nwords = sum(len(ex.doc) for ex in dev_dataset)
                     start_time = timer()
-                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                    scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                     end_time = timer()
                     if use_gpu < 0:
                         gpu_wps = None
                         cpu_wps = nwords / (end_time - start_time)
                     else:
                         gpu_wps = nwords / (end_time - start_time)
-                        with Model.use_device("cpu"):
+                        with use_ops("numpy"):
                             nlp_loaded = util.load_model_from_path(epoch_model_path)
                             for name, component in nlp_loaded.pipeline:
                                 if hasattr(component, "cfg"):
                                     component.cfg["beam_width"] = beam_width
-                            dev_docs = list(
-                                corpus.dev_docs(
+                            dev_dataset = list(
+                                corpus.dev_dataset(
                                     nlp_loaded,
                                     gold_preproc=gold_preproc,
                                     ignore_misaligned=True,
                                 )
                             )
                             start_time = timer()
-                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                            scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                             end_time = timer()
                             cpu_wps = nwords / (end_time - start_time)
-                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
+                    acc_loc = output_path / f"model{i}" / "accuracy.json"
                     srsly.write_json(acc_loc, scorer.scores)
 
                     # Update model meta.json
                     meta["lang"] = nlp.lang
                     meta["pipeline"] = nlp.pipe_names
-                    meta["spacy_version"] = ">=%s" % about.__version__
+                    meta["spacy_version"] = f">={about.__version__}"
                     if beam_width == 1:
                         meta["speed"] = {
                             "nwords": nwords,
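A small sketch (not from this commit) of the Thinc v8 replacement for Model.use_device("cpu") used in the hunk above: use_ops("numpy") switches the current backend inside the block, so models created there run on plain NumPy for the CPU timing pass.

    from thinc.api import use_ops, get_current_ops, Linear

    with use_ops("numpy"):
        print(get_current_ops().name)   # "numpy"
        cpu_model = Linear(nO=2, nI=4)  # layers created here use the NumPy ops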
@@ -465,10 +462,10 @@ def train(
                         "keys": nlp.vocab.vectors.n_keys,
                         "name": nlp.vocab.vectors.name,
                     }
-                    meta.setdefault("name", "model%d" % i)
+                    meta.setdefault("name", f"model{i}")
                     meta.setdefault("version", version)
                     meta["labels"] = nlp.meta["labels"]
-                    meta_loc = output_path / ("model%d" % i) / "meta.json"
+                    meta_loc = output_path / f"model{i}" / "meta.json"
                     srsly.write_json(meta_loc, meta)
                     util.set_env_log(verbose)
 
@@ -486,8 +483,8 @@ def train(
                         for cat, cat_score in textcats_per_cat.items():
                             if cat_score.get("roc_auc_score", 0) < 0:
                                 msg.warn(
-                                    "Textcat ROC AUC score is undefined due to "
-                                    "only one value in label '{}'.".format(cat)
+                                    f"Textcat ROC AUC score is undefined due to "
+                                    f"only one value in label '{cat}'."
                                 )
                     msg.row(progress, **row_settings)
                 # Early stopping
@@ -500,14 +497,14 @@ def train(
                         best_score = current_score
                     if iter_since_best >= n_early_stopping:
                         msg.text(
-                            "Early stopping, best iteration "
-                            "is: {}".format(i - iter_since_best)
+                            f"Early stopping, best iteration is: {i - iter_since_best}"
                         )
                         msg.text(
-                            "Best score = {}; Final iteration "
-                            "score = {}".format(best_score, current_score)
+                            f"Best score = {best_score}; Final iteration score = {current_score}"
                         )
                         break
+    except Exception as e:
+        msg.warn(f"Aborting and saving final best model. Encountered exception: {e}")
     finally:
         best_pipes = nlp.pipe_names
         if disabled_pipes:
@@ -535,6 +532,8 @@ def _score_for_model(meta):
         mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
     if "textcat" in pipes:
         mean_acc.append(acc["textcat_score"])
+    if "sentrec" in pipes:
+        mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3)
     return sum(mean_acc) / len(mean_acc)
 
 
@@ -580,12 +579,10 @@ def _collate_best_model(meta, output_path, components):
     for component in components:
         bests[component] = _find_best(output_path, component)
     best_dest = output_path / "model-best"
-    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
+    shutil.copytree(str(output_path / "model-final"), str(best_dest))
     for component, best_component_src in bests.items():
-        shutil.rmtree(path2str(best_dest / component))
-        shutil.copytree(
-            path2str(best_component_src / component), path2str(best_dest / component)
-        )
+        shutil.rmtree(str(best_dest / component))
+        shutil.copytree(str(best_component_src / component), str(best_dest / component))
        accs = srsly.read_json(best_component_src / "accuracy.json")
         for metric in _get_metrics(component):
             meta["accuracy"][metric] = accs[metric]
@@ -608,11 +605,13 @@ def _find_best(experiment_dir, component):
 
 def _get_metrics(component):
     if component == "parser":
-        return ("las", "uas", "las_per_type", "token_acc")
+        return ("las", "uas", "las_per_type", "token_acc", "sent_f")
     elif component == "tagger":
         return ("tags_acc",)
     elif component == "ner":
-        return ("ents_f", "ents_p", "ents_r", "ents_per_type")
+        return ("ents_f", "ents_p", "ents_r", "enty_per_type")
+    elif component == "sentrec":
+        return ("sent_f", "sent_p", "sent_r")
     elif component == "textcat":
         return ("textcat_score",)
     return ("token_acc",)
@@ -626,14 +625,21 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
             row_head.extend(["Tag Loss ", " Tag %  "])
             output_stats.extend(["tag_loss", "tags_acc"])
         elif pipe == "parser":
-            row_head.extend(["Dep Loss ", " UAS  ", " LAS  "])
-            output_stats.extend(["dep_loss", "uas", "las"])
+            row_head.extend(
+                ["Dep Loss ", " UAS  ", " LAS  ", "Sent P", "Sent R", "Sent F"]
+            )
+            output_stats.extend(
+                ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"]
+            )
         elif pipe == "ner":
             row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
             output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
         elif pipe == "textcat":
             row_head.extend(["Textcat Loss", "Textcat"])
             output_stats.extend(["textcat_loss", "textcat_score"])
+        elif pipe == "sentrec":
+            row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"])
+            output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"])
     row_head.extend(["Token %", "CPU WPS"])
     output_stats.extend(["token_acc", "cpu_wps"])
 
@@ -643,7 +649,10 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
 
     if has_beam_widths:
         row_head.insert(1, "Beam W.")
-    return row_head, output_stats
+    # remove duplicates
+    row_head_dict = {k: 1 for k in row_head}
+    output_stats_dict = {k: 1 for k in output_stats}
+    return row_head_dict.keys(), output_stats_dict.keys()
 
 
 def _get_progress(
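The hunk above de-duplicates the column names while preserving their order by building a dict keyed on them; dict.fromkeys() is the more common spelling of the same trick (dicts keep insertion order on Python 3.7+).

    row_head = ["Itn", "Dep Loss ", "Sent P", "Sent R", "Sent F", "Sent P"]
    deduped = list(dict.fromkeys(row_head))   # order kept, duplicates dropped
    print(deduped)  # ['Itn', 'Dep Loss ', 'Sent P', 'Sent R', 'Sent F']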
@@ -656,6 +665,7 @@ def _get_progress(
     scores["ner_loss"] = losses.get("ner", 0.0)
     scores["tag_loss"] = losses.get("tagger", 0.0)
     scores["textcat_loss"] = losses.get("textcat", 0.0)
+    scores["sentrec_loss"] = losses.get("sentrec", 0.0)
     scores["cpu_wps"] = cpu_wps
     scores["gpu_wps"] = gpu_wps or 0.0
     scores.update(dev_scores)
 spacy/cli/train_from_config.py | 439 lines (new file)
@@ -0,0 +1,439 @@
+from typing import Optional, Dict, List, Union, Sequence
+import plac
+from wasabi import msg
+from pathlib import Path
+import thinc
+import thinc.schedules
+from thinc.api import Model
+from pydantic import BaseModel, FilePath, StrictInt
+import tqdm
+
+# TODO: relative imports?
+import spacy
+from spacy.gold import GoldCorpus
+from spacy.pipeline.tok2vec import Tok2VecListener
+from spacy.ml import component_models
+from spacy import util
+
+
+registry = util.registry
+
+CONFIG_STR = """
+[training]
+patience = 10
+eval_frequency = 10
+dropout = 0.2
+init_tok2vec = null
+vectors = null
+max_epochs = 100
+orth_variant_level = 0.0
+gold_preproc = false
+max_length = 0
+use_gpu = 0
+scores = ["ents_p",  "ents_r", "ents_f"]
+score_weights = {"ents_f": 1.0}
+limit = 0
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "transition_based_ner.v1"
+nr_feature_tokens = 3
+hidden_width = 64
+maxout_pieces = 3
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "tok2vec_tensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "hash_embed_cnn.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 128
+depth = 4
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+"""
+
+
 | 
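Aside, not part of the file (the listing continues below): the ${section:option} references in the config above, such as vectors = ${training:vectors}, let one block reuse a value defined elsewhere. spaCy/Thinc resolve these with their own loader (util.load_from_config); the sketch below only illustrates the syntax using the standard library's configparser.ExtendedInterpolation, which behaves the same way for this simple case. The vector name is just a placeholder value.

# Illustration only: ${section:option} style interpolation.
import configparser

cfg_text = """
[training]
vectors = en_vectors_web_lg

[nlp]
lang = en
vectors = ${training:vectors}
"""

parser = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
parser.read_string(cfg_text)

# The reference is resolved when the value is read:
assert parser["nlp"]["vectors"] == "en_vectors_web_lg"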
class PipelineComponent(BaseModel):
    factory: str
    model: Model

    class Config:
        arbitrary_types_allowed = True


class ConfigSchema(BaseModel):
    optimizer: Optional["Optimizer"]

    class training(BaseModel):
        patience: int = 10
        eval_frequency: int = 100
        dropout: float = 0.2
        init_tok2vec: Optional[FilePath] = None
        vectors: Optional[str] = None
        max_epochs: int = 100
        orth_variant_level: float = 0.0
        gold_preproc: bool = False
        max_length: int = 0
        use_gpu: int = 0
        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
        limit: int = 0
        batch_size: Union[Sequence[int], int]

    class nlp(BaseModel):
        lang: str
        vectors: Optional[str]
        pipeline: Optional[Dict[str, PipelineComponent]]

    class Config:
        extra = "allow"


# Of course, these would normally decorate the functions where they're defined.
# But for now...
@registry.architectures.register("hash_embed_cnn.v1")
def hash_embed_cnn(
    pretrained_vectors, width, depth, embed_size, maxout_pieces, window_size
):
    return component_models.Tok2Vec(
        width=width,
        embed_size=embed_size,
        pretrained_vectors=pretrained_vectors,
        conv_depth=depth,
        cnn_maxout_pieces=maxout_pieces,
        bilstm_depth=0,
        window_size=window_size,
    )


@registry.architectures.register("hash_embed_bilstm.v1")
def hash_embed_bilstm_v1(pretrained_vectors, width, depth, embed_size):
    return component_models.Tok2Vec(
        width=width,
        embed_size=embed_size,
        pretrained_vectors=pretrained_vectors,
        bilstm_depth=depth,
        conv_depth=0,
        cnn_maxout_pieces=0,
    )


@registry.architectures.register("tagger_model.v1")
def build_tagger_model_v1(tok2vec):
    return component_models.build_tagger_model(nr_class=None, tok2vec=tok2vec)


@registry.architectures.register("transition_based_parser.v1")
def create_tb_parser_model(
    tok2vec: Model,
    nr_feature_tokens: StrictInt = 3,
    hidden_width: StrictInt = 64,
    maxout_pieces: StrictInt = 3,
):
    from thinc.api import Linear, chain, list2array, use_ops, zero_init
    from spacy.ml._layers import PrecomputableAffine
    from spacy.syntax._parser_model import ParserModel

    token_vector_width = tok2vec.get_dim("nO")
    tok2vec = chain(tok2vec, list2array())
    tok2vec.set_dim("nO", token_vector_width)

    lower = PrecomputableAffine(
        hidden_width, nF=nr_feature_tokens, nI=tok2vec.get_dim("nO"), nP=maxout_pieces
    )
    lower.set_dim("nP", maxout_pieces)
    with use_ops("numpy"):
        # Initialize weights at zero, as it's a classification layer.
        upper = Linear(init_W=zero_init)
    return ParserModel(tok2vec, lower, upper)
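Aside, not part of the file (the listing continues below): the @registry.architectures.register("...") decorators above are what let the config's @architectures = "hash_embed_cnn.v1" lines resolve to Python functions. Here is a minimal, self-contained sketch of that name-to-function registry pattern; it is only an illustration, not the actual spacy.util / catalogue implementation, and the names used are made up.

# Toy registry: a string name in a config block is looked up and called with
# the remaining keys as keyword arguments.
class Registry:
    def __init__(self):
        self._funcs = {}

    def register(self, name):
        def decorator(func):
            self._funcs[name] = func
            return func
        return decorator

    def make(self, name, **kwargs):
        return self._funcs[name](**kwargs)


architectures = Registry()


@architectures.register("toy_model.v1")
def toy_model(width, depth):
    return {"width": width, "depth": depth}


# A config block like
#   [model]
#   @architectures = "toy_model.v1"
#   width = 128
#   depth = 4
# would then resolve to:
model = architectures.make("toy_model.v1", width=128, depth=4)
assert model == {"width": 128, "depth": 4}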
@plac.annotations(
    # fmt: off
    train_path=("Location of JSON-formatted training data", "positional", None, Path),
    dev_path=("Location of JSON-formatted development data", "positional", None, Path),
    config_path=("Path to config file", "positional", None, Path),
    output_path=("Output directory to store model in", "option", "o", Path),
    meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
    raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
    # fmt: on
)
def train_from_config_cli(
    train_path,
    dev_path,
    config_path,
    output_path=None,
    meta_path=None,
    raw_text=None,
    debug=False,
    verbose=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    if output_path is not None and not output_path.exists():
        output_path.mkdir()

    try:
        train_from_config(
            config_path,
            {"train": train_path, "dev": dev_path},
            output_path=output_path,
            meta_path=meta_path,
            raw_text=raw_text,
        )
    except KeyboardInterrupt:
        msg.warn("Cancelled.")


def train_from_config(
    config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
):
    msg.info(f"Loading config from: {config_path}")
    config = util.load_from_config(config_path, create_objects=True)
    use_gpu = config["training"]["use_gpu"]
    if use_gpu >= 0:
        msg.info("Using GPU")
    else:
        msg.info("Using CPU")
    msg.info("Creating nlp from config")
    nlp = create_nlp_from_config(**config["nlp"])
    optimizer = config["optimizer"]
    limit = config["training"]["limit"]
    msg.info("Loading training corpus")
    corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
    msg.info("Initializing the nlp pipeline")
    nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)

    train_batches = create_train_batches(nlp, corpus, config["training"])
    evaluate = create_evaluation_callback(nlp, optimizer, corpus, config["training"])

    # Create iterator, which yields out info after each optimization step.
    msg.info("Start training")
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        train_batches,
        evaluate,
        config["training"]["dropout"],
        config["training"]["patience"],
        config["training"]["eval_frequency"],
    )

    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
    print_row = setup_printer(config)

    try:
        progress = tqdm.tqdm(total=config["training"]["eval_frequency"], leave=False)
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    nlp.to_disk(output_path)
                progress = tqdm.tqdm(
                    total=config["training"]["eval_frequency"], leave=False
                )
    finally:
        if output_path is not None:
            with nlp.use_params(optimizer.averages):
                final_model_path = output_path / "model-final"
                nlp.to_disk(final_model_path)
            msg.good("Saved model to output directory", final_model_path)
        # with msg.loading("Creating best model..."):
        #     best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        # msg.good("Created best model", best_model_path)


def create_nlp_from_config(lang, vectors, pipeline):
    lang_class = spacy.util.get_lang_class(lang)
    nlp = lang_class()
    if vectors is not None:
        spacy.cli.train._load_vectors(nlp, vectors)
    for name, component_cfg in pipeline.items():
        factory = component_cfg.pop("factory")
        component = nlp.create_pipe(factory, config=component_cfg)
        nlp.add_pipe(component, name=name)
    return nlp


def create_train_batches(nlp, corpus, cfg):
    while True:
        train_examples = corpus.train_dataset(
            nlp,
            noise_level=0.0,
            orth_variant_level=cfg["orth_variant_level"],
            gold_preproc=cfg["gold_preproc"],
            max_length=cfg["max_length"],
            ignore_misaligned=True,
        )
        for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
            yield batch


def create_evaluation_callback(nlp, optimizer, corpus, cfg):
    def evaluate():
        with nlp.use_params(optimizer.averages):
            dev_examples = list(
                corpus.dev_dataset(
                    nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
                )
            )
            scorer = nlp.evaluate(dev_examples)
            scores = scorer.scores
            # Calculate a weighted sum based on score_weights for the main score
            weights = cfg["score_weights"]
            weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
        return weighted_score, scorer.scores

    return evaluate


def train_while_improving(
    nlp, optimizer, train_data, evaluate, dropout, patience, eval_frequency
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback, which should

    Positional arguments:
        nlp: The spaCy pipeline to evaluate.
        train_data (Iterable[Batch]): A generator of batches, with the training
            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
            data iterable needs to take care of iterating over the epochs and
            shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
            The callback should take no arguments and return a tuple
            `(main_score, other_scores)`. The main_score should be a float where
            higher is better. other_scores can be any object.

    Every iteration, the function yields out a tuple with:

    * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

        epoch (int): How many passes over the data have been completed.
        step (int): How many steps have been completed.
        score (float): The main score form the last evaluation.
        other_scores: : The other scores from the last evaluation.
        loss: The accumulated losses throughout training.
        checkpoints: A list of previous results, where each result is a
            (score, step, epoch) tuple.
    """
    if isinstance(dropout, float):
        dropouts = thinc.schedules.constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    for step, batch in enumerate(train_data):
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch):
            nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
        for name, proc in nlp.pipeline:
            if hasattr(proc, "model"):
                proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        info = {
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            losses = {}
        # Stop if no improvement in `patience` updates
        best_score, best_step = max(results)
        if (step - best_step) >= patience:
            break


def subdivide_batch(batch):
    return [batch]


def setup_printer(config):
    score_cols = config["training"]["scores"]
    score_widths = [max(len(col), 6) for col in score_cols]
    loss_cols = [f"Loss {pipe}" for pipe in config["nlp"]["pipeline"]]
    loss_widths = [max(len(col), 8) for col in loss_cols]
    table_header = ["#"] + loss_cols + score_cols + ["Score"]
    table_header = [col.upper() for col in table_header]
    table_widths = [6] + loss_widths + score_widths + [6]
    table_aligns = ["r" for _ in table_widths]

    msg.row(table_header, widths=table_widths)
    msg.row(["-" * width for width in table_widths])

    def print_row(info):
        losses = [
            "{0:.2f}".format(info["losses"].get(col, 0.0))
            for col in config["nlp"]["pipeline"]
        ]
        scores = [
            "{0:.2f}".format(info["other_scores"].get(col, 0.0))
            for col in config["training"]["scores"]
        ]
        data = [info["step"]] + losses + scores + ["{0:.2f}".format(info["score"])]
        msg.row(data, widths=table_widths, aligns=table_aligns)

    return print_row


@registry.architectures.register("tok2vec_tensors.v1")
def tok2vec_tensors_v1(width):
    tok2vec = Tok2VecListener("tok2vec", width=width)
    return tok2vec
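Aside, not part of the commit: train_while_improving above separates the update loop from checkpointing. The caller decides what to do when is_best_checkpoint is True, False or None, and the generator stops itself once `patience` steps pass without a new best score. A toy, self-contained sketch of that same contract, with made-up scores and no spaCy objects:

# Toy version of the yield-and-stop contract used by train_while_improving.
def improving_steps(scores, eval_frequency=1, patience=2):
    results = []
    for step, score in enumerate(scores):
        if not (step % eval_frequency):
            results.append((score, step))
            is_best = score == max(results)[0]
        else:
            is_best = None
        yield step, score, is_best
        best_score, best_step = max(results)
        if (step - best_step) >= patience:
            break


steps = list(improving_steps([0.1, 0.3, 0.2, 0.25, 0.24]))
# The best score arrives at step 1; with patience=2 the loop breaks at
# step 3, so step 4 is never reached.
assert [s for s, _, _ in steps] == [0, 1, 2, 3]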
@@ -1,14 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 import sys
 import requests
-import srsly
 from wasabi import msg

-from ..compat import path2str
-from ..util import get_data_path
 from .. import about


@@ -17,51 +11,30 @@ def validate():
     Validate that the currently installed version of spaCy is compatible
     with the installed models. Should be run after `pip install -U spacy`.
     """
-    with msg.loading("Loading compatibility table..."):
-        r = requests.get(about.__compatibility__)
-        if r.status_code != 200:
-            msg.fail(
-                "Server error ({})".format(r.status_code),
-                "Couldn't fetch compatibility table.",
-                exits=1,
-            )
-    msg.good("Loaded compatibility table")
-    compat = r.json()["spacy"]
-    version = about.__version__
-    version = version.rsplit(".dev", 1)[0]
-    current_compat = compat.get(version)
+    model_pkgs, compat = get_model_pkgs()
+    spacy_version = about.__version__.rsplit(".dev", 1)[0]
+    current_compat = compat.get(spacy_version, {})
     if not current_compat:
-        msg.fail(
-            "Can't find spaCy v{} in compatibility table".format(version),
-            about.__compatibility__,
-            exits=1,
-        )
-    all_models = set()
-    for spacy_v, models in dict(compat).items():
-        all_models.update(models.keys())
-        for model, model_vs in models.items():
-            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
-    model_links = get_model_links(current_compat)
-    model_pkgs = get_model_pkgs(current_compat, all_models)
-    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
+        msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
     incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
-    incompat_models.update(
-        [d["name"] for _, d in model_links.items() if not d["compat"]]
-    )
     na_models = [m for m in incompat_models if m not in current_compat]
     update_models = [m for m in incompat_models if m in current_compat]
     spacy_dir = Path(__file__).parent.parent

-    msg.divider("Installed models (spaCy v{})".format(about.__version__))
-    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.info(f"spaCy installation: {spacy_dir}")

-    if model_links or model_pkgs:
-        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
+    if model_pkgs:
+        header = ("NAME", "VERSION", "")
         rows = []
         for name, data in model_pkgs.items():
-            rows.append(get_model_row(current_compat, name, data, msg))
-        for name, data in model_links.items():
-            rows.append(get_model_row(current_compat, name, data, msg, "link"))
+            if data["compat"]:
+                comp = msg.text("", color="green", icon="good", no_print=True)
+                version = msg.text(data["version"], color="green", no_print=True)
+            else:
+                version = msg.text(data["version"], color="red", no_print=True)
+                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
+            rows.append((data["name"], version, comp))
         msg.table(rows, header=header)
     else:
         msg.text("No models found in your current environment.", exits=0)
@@ -71,44 +44,32 @@ def validate():
         cmd = "python -m spacy download {}"
         print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
     if na_models:
-        msg.text(
-            "The following models are not available for spaCy "
-            "v{}: {}".format(about.__version__, ", ".join(na_models))
+        msg.warn(
+            f"The following models are not available for spaCy v{about.__version__}:",
+            ", ".join(na_models),
         )
-    if incompat_links:
-        msg.text(
-            "You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. "
-            "Data path: {path}".format(path=path2str(get_data_path()))
-        )
-    if incompat_models or incompat_links:
+    if incompat_models:
         sys.exit(1)


-def get_model_links(compat):
-    links = {}
-    data_path = get_data_path()
-    if data_path:
-        models = [p for p in data_path.iterdir() if is_model_path(p)]
-        for model in models:
-            meta_path = Path(model) / "meta.json"
-            if not meta_path.exists():
-                continue
-            meta = srsly.read_json(meta_path)
-            link = model.parts[-1]
-            name = meta["lang"] + "_" + meta["name"]
-            links[link] = {
-                "name": name,
-                "version": meta["version"],
-                "compat": is_compat(compat, name, meta["version"]),
-            }
-    return links
-
-
-def get_model_pkgs(compat, all_models):
+def get_model_pkgs():
     import pkg_resources

+    with msg.loading("Loading compatibility table..."):
+        r = requests.get(about.__compatibility__)
+        if r.status_code != 200:
+            msg.fail(
+                f"Server error ({r.status_code})",
+                "Couldn't fetch compatibility table.",
+                exits=1,
+            )
+    msg.good("Loaded compatibility table")
+    compat = r.json()["spacy"]
+    all_models = set()
+    for spacy_v, models in dict(compat).items():
+        all_models.update(models.keys())
+        for model, model_vs in models.items():
+            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
     pkgs = {}
     for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
         package = pkg_name.replace("-", "_")
@@ -117,29 +78,9 @@ def get_model_pkgs(compat, all_models):
             pkgs[pkg_name] = {
                 "name": package,
                 "version": version,
-                "compat": is_compat(compat, package, version),
+                "compat": package in compat and version in compat[package],
             }
-    return pkgs
+    return pkgs, compat
-
-
-def get_model_row(compat, name, data, msg, model_type="package"):
-    if data["compat"]:
-        comp = msg.text("", color="green", icon="good", no_print=True)
-        version = msg.text(data["version"], color="green", no_print=True)
-    else:
-        version = msg.text(data["version"], color="red", no_print=True)
-        comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
-    return (model_type, name, data["name"], version, comp)
-
-
-def is_model_path(model_path):
-    exclude = ["cache", "pycache", "__pycache__"]
-    name = model_path.parts[-1]
-    return model_path.is_dir() and name not in exclude and not name.startswith(".")
-
-
-def is_compat(compat, name, version):
-    return name in compat and version in compat[name]


 def reformat_version(version):
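Aside, not part of the commit: after this change the compatibility test reduces to a plain membership check, `package in compat and version in compat[package]`. A tiny standalone illustration with a made-up table (the real table is fetched from about.__compatibility__):

# Illustration only: checking an installed package version against a
# compatibility table shaped like {"model_name": ["ok_version", ...]}.
compat = {"en_core_web_sm": ["2.2.5", "2.2.0"]}

def is_compatible(package, version):
    return package in compat and version in compat[package]

assert is_compatible("en_core_web_sm", "2.2.5")
assert not is_compatible("en_core_web_sm", "2.1.0")
assert not is_compatible("de_core_news_sm", "2.2.5")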
							
								
								
									
129  spacy/compat.py

@@ -1,4 +1,3 @@
-# coding: utf8
 """
 Helpers for Python and platform compatibility. To distinguish them from
 the builtin functions, replacement functions are suffixed with an underscore,
@@ -6,15 +5,9 @@ e.g. `unicode_`.

 DOCS: https://spacy.io/api/top-level#compat
 """
-from __future__ import unicode_literals
-
-import os
 import sys
-import itertools
-import ast
-import types

-from thinc.neural.util import copy_array
+from thinc.util import copy_array

 try:
     import cPickle as pickle
@@ -36,91 +29,23 @@ try:
 except ImportError:
     cupy = None

-try:
-    from thinc.neural.optimizers import Optimizer  # noqa: F401
-except ImportError:
-    from thinc.neural.optimizers import Adam as Optimizer  # noqa: F401
+from thinc.api import Optimizer  # noqa: F401

 pickle = pickle
 copy_reg = copy_reg
 CudaStream = CudaStream
 cupy = cupy
 copy_array = copy_array
-izip = getattr(itertools, "izip", zip)

 is_windows = sys.platform.startswith("win")
 is_linux = sys.platform.startswith("linux")
 is_osx = sys.platform == "darwin"

-# See: https://github.com/benjaminp/six/blob/master/six.py
-is_python2 = sys.version_info[0] == 2
-is_python3 = sys.version_info[0] == 3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

-if is_python2:
-    bytes_ = str
-    unicode_ = unicode  # noqa: F821
-    basestring_ = basestring  # noqa: F821
-    input_ = raw_input  # noqa: F821
-    path2str = lambda path: str(path).decode("utf8")
-    class_types = (type, types.ClassType)
-
-elif is_python3:
-    bytes_ = bytes
-    unicode_ = str
-    basestring_ = str
-    input_ = input
-    path2str = lambda path: str(path)
-    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
-
-
-def b_to_str(b_str):
-    """Convert a bytes object to a string.
-
-    b_str (bytes): The object to convert.
-    RETURNS (unicode): The converted string.
-    """
-    if is_python2:
-        return b_str
-    # Important: if no encoding is set, string becomes "b'...'"
-    return str(b_str, encoding="utf8")
-
-
-def symlink_to(orig, dest):
-    """Create a symlink. Used for model shortcut links.
-
-    orig (unicode / Path): The origin path.
-    dest (unicode / Path): The destination path of the symlink.
-    """
-    if is_windows:
-        import subprocess
-
-        subprocess.check_call(
-            ["mklink", "/d", path2str(orig), path2str(dest)], shell=True
-        )
-    else:
-        orig.symlink_to(dest)
-
-
-def symlink_remove(link):
-    """Remove a symlink. Used for model shortcut links.
-
-    link (unicode / Path): The path to the symlink.
-    """
-    # https://stackoverflow.com/q/26554135/6400719
-    if os.path.isdir(path2str(link)) and is_windows:
-        # this should only be on Py2.7 and windows
-        os.rmdir(path2str(link))
-    else:
-        os.unlink(path2str(link))
-
-
-def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+def is_config(windows=None, linux=None, osx=None, **kwargs):
     """Check if a specific configuration of Python version and operating system
     matches the user's setup. Mostly used to display targeted error messages.

-    python2 (bool): spaCy is executed with Python 2.x.
-    python3 (bool): spaCy is executed with Python 3.x.
     windows (bool): spaCy is executed on Windows.
     linux (bool): spaCy is executed on Linux.
     osx (bool): spaCy is executed on OS X or macOS.
@@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
     DOCS: https://spacy.io/api/top-level#compat.is_config
     """
     return (
-        python2 in (None, is_python2)
-        and python3 in (None, is_python3)
-        and windows in (None, is_windows)
+        windows in (None, is_windows)
         and linux in (None, is_linux)
         and osx in (None, is_osx)
     )
-
-
-def import_file(name, loc):
-    """Import module from a file. Used to load models from a directory.
-
-    name (unicode): Name of module to load.
-    loc (unicode / Path): Path to the file.
-    RETURNS: The loaded module.
-    """
-    loc = path2str(loc)
-    if is_python_pre_3_5:
-        import imp
-
-        return imp.load_source(name, loc)
-    else:
-        import importlib.util
-
-        spec = importlib.util.spec_from_file_location(name, str(loc))
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        return module
-
-
-def unescape_unicode(string):
-    """Python2.7's re module chokes when compiling patterns that have ranges
-    between escaped unicode codepoints if the two codepoints are unrecognised
-    in the unicode database. For instance:
-
-        re.compile('[\\uAA77-\\uAA79]').findall("hello")
-
-    Ends up matching every character (on Python 2). This problem doesn't occur
-    if we're dealing with unicode literals.
-    """
-    if string is None:
-        return string
-    # We only want to unescape the unicode, so we first must protect the other
-    # backslashes.
-    string = string.replace("\\", "\\\\")
-    # Now we remove that protection for the unicode.
-    string = string.replace("\\\\u", "\\u")
-    string = string.replace("\\\\U", "\\U")
-    # Now we unescape by evaling the string with the AST. This can't execute
-    # code -- it only does the representational level.
-    return ast.literal_eval("u'''" + string + "'''")
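Aside, not part of the commit: is_config() keeps the `value in (None, actual)` pattern, where passing None for a flag means "don't filter on this". A toy, self-contained version of the same idea, with the platform flags passed in explicitly so it can run anywhere:

# Illustration only, not the spaCy helper.
def platform_matches(windows=None, linux=None, osx=None,
                     is_windows=False, is_linux=True, is_osx=False):
    return (
        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )

assert platform_matches(linux=True)        # only constrain Linux
assert platform_matches()                  # no constraints at all
assert not platform_matches(windows=True)  # fails on this non-Windows setup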
@@ -1,15 +1,11 @@
-# coding: utf8
 """
 spaCy's built in visualization suite for dependencies and named entities.

 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from __future__ import unicode_literals
-
 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc, Span
-from ..compat import b_to_str
 from ..errors import Errors, Warnings, user_warning
 from ..util import is_in_jupyter

@@ -93,20 +89,20 @@ def serve(

     render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
     httpd = simple_server.make_server(host, port, app)
-    print("\nUsing the '{}' visualizer".format(style))
-    print("Serving on http://{}:{} ...\n".format(host, port))
+    print(f"\nUsing the '{style}' visualizer")
+    print(f"Serving on http://{host}:{port} ...\n")
     try:
         httpd.serve_forever()
     except KeyboardInterrupt:
-        print("Shutting down server on port {}.".format(port))
+        print(f"Shutting down server on port {port}.")
     finally:
         httpd.server_close()


 def app(environ, start_response):
     # Headers and status need to be bytes in Python 2, see #1227
-    headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
-    start_response(b_to_str(b"200 OK"), headers)
+    headers = [("Content-type", "text/html; charset=utf-8")]
+    start_response("200 OK", headers)
     res = _html["parsed"].encode(encoding="utf-8")
     return [res]
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import uuid
 | 
					import uuid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
 | 
					from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
 | 
				
			||||||
| 
						 | 
					@ -55,7 +52,7 @@ class DependencyRenderer(object):
 | 
				
			||||||
                settings = p.get("settings", {})
 | 
					                settings = p.get("settings", {})
 | 
				
			||||||
                self.direction = settings.get("direction", DEFAULT_DIR)
 | 
					                self.direction = settings.get("direction", DEFAULT_DIR)
 | 
				
			||||||
                self.lang = settings.get("lang", DEFAULT_LANG)
 | 
					                self.lang = settings.get("lang", DEFAULT_LANG)
 | 
				
			||||||
            render_id = "{}-{}".format(id_prefix, i)
 | 
					            render_id = f"{id_prefix}-{i}"
 | 
				
			||||||
            svg = self.render_svg(render_id, p["words"], p["arcs"])
 | 
					            svg = self.render_svg(render_id, p["words"], p["arcs"])
 | 
				
			||||||
            rendered.append(svg)
 | 
					            rendered.append(svg)
 | 
				
			||||||
        if page:
 | 
					        if page:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Setting explicit height and max-width: none on the SVG is required for
 | 
					# Setting explicit height and max-width: none on the SVG is required for
 | 
				
			||||||
# Jupyter to render it properly in a cell
 | 
					# Jupyter to render it properly in a cell
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
import inspect
 | 
					import inspect
 | 
				
			||||||
| 
						 | 
@@ -12,7 +9,7 @@ def add_codes(err_cls):
     class ErrorsWithCodes(object):
         def __getattribute__(self, code):
             msg = getattr(err_cls, code)
-            return "[{code}] {msg}".format(code=code, msg=msg)
+            return f"[{code}] {msg}"
 
     return ErrorsWithCodes()
 
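
For context, the `add_codes` pattern shown in this hunk wraps the error container so that attribute access returns the message prefixed with its code; the f-string only changes how that prefix is interpolated. A small standalone sketch (the `Demo` class and its message are invented for illustration):

# Plain-Python sketch of the pattern above (simplified): attribute access on
# the wrapped container returns the message prefixed with its code.
def add_codes(err_cls):
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return f"[{code}] {msg}"

    return ErrorsWithCodes()


@add_codes
class Demo(object):
    E001 = "Something went wrong: {detail}"


# The code is prepended lazily on lookup; placeholders are filled by the caller.
print(Demo.E001.format(detail="missing value"))  # [E001] Something went wrong: missing value
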
@@ -97,8 +94,6 @@ class Warnings(object):
             "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
             "If this is surprising, make sure you have the spacy-lookups-data "
             "package installed.")
-    W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
-            "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
@@ -107,7 +102,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Skipping unsupported morphological feature(s): {feature}. "
+            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+            "string \"Field1=Value1,Value2|Field2=Value3\".")
 
 
 @add_codes
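
The new W028 names two accepted shapes for morphological features. A standalone illustration (field and value names are examples only, and the parsing helper is hypothetical, not a spaCy API):

# The two formats mentioned in W028, with example values: a dict mapping
# fields to comma-separated values, or a UD-style string with "|" between fields.
features_as_dict = {"Case": "Nom", "Number": "Sing,Plur"}
features_as_string = "Case=Nom|Number=Sing,Plur"

# Hypothetical helper, for illustration only: convert the string form to the dict form.
def parse_feature_string(feats):
    return dict(field.split("=", 1) for field in feats.split("|") if field)

assert parse_feature_string(features_as_string) == features_as_dict
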
@@ -227,13 +224,8 @@ class Errors(object):
     E047 = ("Can't assign a value to unregistered extension attribute "
             "'{name}'. Did you forget to call the `set_extension` method?")
     E048 = ("Can't import language {lang} from spacy.lang: {err}")
-    E049 = ("Can't find spaCy data directory: '{path}'. Check your "
-            "installation and permissions, or use spacy.util.set_data_path "
-            "to customise the location if necessary.")
-    E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
-            "link, a Python package or a valid path to a data directory.")
-    E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
-            "it points to a valid package (not just a data directory).")
+    E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
+            "package or a valid path to a data directory.")
     E052 = ("Can't find model directory: {path}")
     E053 = ("Could not read meta.json from {path}")
     E054 = ("No valid '{setting}' setting found in model meta.json.")
@@ -424,8 +416,6 @@ class Errors(object):
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
     E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
             "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
-    E136 = ("This additional feature requires the jsonschema library to be "
-            "installed:\npip install jsonschema")
     E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
             "to provide a valid JSON object as input with either the `text` "
             "or `tokens` key. For more info, see the docs:\n"
@@ -541,6 +531,15 @@ class Errors(object):
     E188 = ("Could not match the gold entity links to entities in the doc - "
             "make sure the gold EL data refers to valid results of the "
             "named entity recognizer in the `nlp` pipeline.")
+    # TODO: fix numbering after merging develop into master
+    E996 = ("Could not parse {file}: {msg}")
+    E997 = ("Tokenizer special cases are not allowed to modify the text. "
+            "This would map '{chunk}' to '{orth}' given token attributes "
+            "'{token_attrs}'.")
+    E998 = ("Can only create GoldParse objects from Example objects without a "
+            "Doc if get_gold_parses() is called with a Vocab object.")
+    E999 = ("Encountered an unexpected format for the dictionary holding "
+            "gold annotations: {gold_dict}")
 
 
 @add_codes
@@ -566,10 +565,10 @@ class MatchPatternError(ValueError):
         errors (dict): Validation errors (sequence of strings) mapped to pattern
             ID, i.e. the index of the added pattern.
         """
-        msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
+        msg = f"Invalid token patterns for matcher rule '{key}'\n"
         for pattern_idx, error_msgs in errors.items():
-            pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
-            msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
+            pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
+            msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
         ValueError.__init__(self, msg)
 
 
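
Run standalone with invented validation messages, the same loop shows the message layout these f-strings produce:

# Sketch of the message construction above; the error strings are invented.
key = "my_rule"
errors = {0: ["'OP' must be one of '!', '?', '+', '*'"], 1: ["unknown attribute 'TEXTT'"]}

msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
    pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
    msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"

print(msg)
# Invalid token patterns for matcher rule 'my_rule'
#
# Pattern 0:
# - 'OP' must be one of '!', '?', '+', '*'
#
# Pattern 1:
# - unknown attribute 'TEXTT'
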
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 def explain(term):
     """Get a description for a given POS tag, dependency label or entity type.
 
@@ -1,6 +1,6 @@
 from cymem.cymem cimport Pool
 
-from .structs cimport TokenC
+from .tokens import Doc
 from .typedefs cimport attr_t
 from .syntax.transition_system cimport Transition
 
@@ -19,23 +19,49 @@ cdef class GoldParse:
     cdef Pool mem
 
     cdef GoldParseC c
+    cdef readonly TokenAnnotation orig
 
     cdef int length
     cdef public int loss
     cdef public list words
     cdef public list tags
-    cdef public list morphology
+    cdef public list pos
+    cdef public list morphs
+    cdef public list lemmas
+    cdef public list sent_starts
     cdef public list heads
     cdef public list labels
     cdef public dict orths
     cdef public list ner
-    cdef public list ents
     cdef public dict brackets
-    cdef public object cats
+    cdef public dict cats
     cdef public dict links
 
     cdef readonly list cand_to_gold
     cdef readonly list gold_to_cand
-    cdef readonly list orig_annot
+
+
+cdef class TokenAnnotation:
+    cdef public list ids
+    cdef public list words
+    cdef public list tags
+    cdef public list pos
+    cdef public list morphs
+    cdef public list lemmas
+    cdef public list heads
+    cdef public list deps
+    cdef public list entities
+    cdef public list sent_starts
+    cdef public list brackets
+
+
+cdef class DocAnnotation:
+    cdef public object cats
+    cdef public object links
+
+
+cdef class Example:
+    cdef public object doc
+    cdef public TokenAnnotation token_annotation
+    cdef public DocAnnotation doc_annotation
+    cdef public object goldparse
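
The new declarations introduce `TokenAnnotation`, `DocAnnotation` and `Example` as containers for gold-standard data: an `Example` pairs a doc with token-level and doc-level annotation. A rough plain-Python sketch of how these pieces relate (field names follow the `.pxd` above; the constructors and defaults are assumptions for illustration, not the Cython implementation):

# Plain-Python sketch, not the Cython implementation above: an Example groups
# a doc with token-level (TokenAnnotation) and doc-level (DocAnnotation) gold data.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class TokenAnnotation:
    ids: List[int] = field(default_factory=list)
    words: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)
    heads: List[int] = field(default_factory=list)
    deps: List[str] = field(default_factory=list)
    entities: List[str] = field(default_factory=list)

@dataclass
class DocAnnotation:
    cats: Dict[str, float] = field(default_factory=dict)
    links: Dict[Any, Any] = field(default_factory=dict)

@dataclass
class Example:
    doc: Optional[Any] = None
    token_annotation: TokenAnnotation = field(default_factory=TokenAnnotation)
    doc_annotation: DocAnnotation = field(default_factory=DocAnnotation)

# Example usage with invented values:
eg = Example(
    token_annotation=TokenAnnotation(
        words=["San", "Francisco", "is", "foggy"],
        entities=["B-GPE", "I-GPE", "O", "O"],
    ),
    doc_annotation=DocAnnotation(cats={"WEATHER": 1.0}),
)
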
							
								
								
									
spacy/gold.pyx — 772 changes (file diff suppressed because it is too large).
@@ -6,7 +6,7 @@ from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
 
-from spacy.vocab cimport Vocab
+from .vocab cimport Vocab
 from .typedefs cimport hash_t
 
 from .structs cimport KBEntryC, AliasC
@@ -169,4 +169,3 @@ cdef class Reader:
     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
 
     cdef int _read(self, void* value, size_t size) except -1
-
							
								
								
									
spacy/kb.pyx — 15 changes:
@@ -1,22 +1,17 @@
 # cython: infer_types=True
 # cython: profile=True
-# coding: utf8
-from spacy.errors import Errors, Warnings, user_warning
-
 from pathlib import Path
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-
 from cpython.exc cimport PyErr_SetFromErrno
-
 from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
 from libc.stdint cimport int32_t, int64_t
-
-from .typedefs cimport hash_t
-
 from os import path
 from libcpp.vector cimport vector
 
+from .typedefs cimport hash_t
+from .errors import Errors, Warnings, user_warning
+
 
 cdef class Candidate:
     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
@@ -447,7 +442,7 @@ cdef class KnowledgeBase:
 cdef class Writer:
     def __init__(self, object loc):
         if path.exists(loc):
-            assert not path.isdir(loc), "%s is directory." % loc
+            assert not path.isdir(loc), f"{loc} is directory"
         if isinstance(loc, Path):
             loc = bytes(loc)
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@@ -584,5 +579,3 @@ cdef class Reader:
     cdef int _read(self, void* value, size_t size) except -1:
         status = fread(value, size, 1, self._fp)
         return status
-
-
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/stopwords-iso/stopwords-af
 
 STOP_WORDS = set(
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 from ...attrs import LIKE_NUM
 
 _num_words = set(
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import UNITS, ALPHA_UPPER
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 من
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/Alir3z4/stop-words
 
 STOP_WORDS = set(
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import LEMMA, PRON_LEMMA
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
 from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
 
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত  অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
 
@@ -14,8 +11,8 @@ TAG_MAP = {
     '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
     "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
     ":": {POS: PUNCT},
-    "৳": {POS: SYM, "Other": {"SymType": "currency"}},
-    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+    "৳": {POS: SYM, "SymType": "currency"},
+    "#": {POS: SYM, "SymType": "numbersign"},
     "AFX": {POS: ADJ, "Hyph": "yes"},
     "CC": {POS: CONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
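
The tag-map change flattens the nested "Other" container so the SymType feature sits directly in the entry. A tiny standalone illustration (plain strings stand in for the spacy.symbols imports):

# Before vs. after the flattening, with values taken from the hunk above.
POS, SYM = "POS", "SYM"

old_entry = {POS: SYM, "Other": {"SymType": "currency"}}   # feature nested under "Other"
new_entry = {POS: SYM, "SymType": "currency"}              # feature at top level

assert new_entry["SymType"] == "currency"
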
@@ -1,6 +1,3 @@
-# coding=utf-8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..punctuation import TOKENIZER_INFIXES
 from ..char_classes import ALPHA
 
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
@@ -1,28 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
-from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
-
-
-TAG_MAP = {
-    "ADV": {POS: ADV},
-    "NOUN": {POS: NOUN},
-    "ADP": {POS: ADP},
-    "PRON": {POS: PRON},
-    "SCONJ": {POS: SCONJ},
-    "PROPN": {POS: PROPN},
-    "DET": {POS: DET},
-    "SYM": {POS: SYM},
-    "INTJ": {POS: INTJ},
-    "PUNCT": {POS: PUNCT},
-    "NUM": {POS: NUM},
-    "AUX": {POS: AUX},
-    "X": {POS: X},
-    "CONJ": {POS: CONJ},
-    "CCONJ": {POS: CCONJ},
-    "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB},
-    "PART": {POS: PART},
-    "SP": {POS: SPACE},
-}
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA
 
 
@@ -33,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
 
 for h in range(1, 12 + 1):
     for period in ["a.m.", "am"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
     for period in ["p.m.", "pm"]:
-        _exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
 
 
 TOKENIZER_EXCEPTIONS = _exc
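
Run on its own, the rewritten loop produces entries like the following (plain strings stand in for the ORTH/LEMMA symbols):

# Standalone illustration of what the loop above generates.
ORTH, LEMMA = "ORTH", "LEMMA"
_exc = {}

for h in range(1, 12 + 1):
    for period in ["a.m.", "am"]:
        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
    for period in ["p.m.", "pm"]:
        _exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]

# e.g. "3pm" is split into two tokens, "3" and "pm", with "pm" lemmatized to "p.m."
print(_exc["3pm"])  # [{'ORTH': '3'}, {'ORTH': 'pm', 'LEMMA': 'p.m.'}]
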
| 
						 | 
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
split_chars = lambda char: list(char.strip().split(" "))
 | 
					split_chars = lambda char: list(char.strip().split(" "))
 | 
				
			||||||
merge_chars = lambda char: char.strip().replace(" ", "|")
 | 
					merge_chars = lambda char: char.strip().replace(" ", "|")
 | 
				
			||||||
group_chars = lambda char: char.strip().replace(" ", "")
 | 
					group_chars = lambda char: char.strip().replace(" ", "")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
from ...language import Language
 | 
					from ...language import Language
 | 
				
			||||||
from ...attrs import LANG
 | 
					from ...attrs import LANG
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# Source: https://github.com/Alir3z4/stop-words
 | 
					# Source: https://github.com/Alir3z4/stop-words
 | 
				
			||||||
 | 
					
 | 
				
			||||||
STOP_WORDS = set(
 | 
					STOP_WORDS = set(
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,13 +1,9 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
from .norm_exceptions import NORM_EXCEPTIONS
 | 
					from .norm_exceptions import NORM_EXCEPTIONS
 | 
				
			||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
					from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 | 
				
			||||||
from .stop_words import STOP_WORDS
 | 
					from .stop_words import STOP_WORDS
 | 
				
			||||||
from .lex_attrs import LEX_ATTRS
 | 
					from .lex_attrs import LEX_ATTRS
 | 
				
			||||||
from .morph_rules import MORPH_RULES
 | 
					from .morph_rules import MORPH_RULES
 | 
				
			||||||
from ..tag_map import TAG_MAP
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
from ..norm_exceptions import BASE_NORMS
 | 
					from ..norm_exceptions import BASE_NORMS
 | 
				
			||||||
| 
						 | 
					@ -27,7 +23,6 @@ class DanishDefaults(Language.Defaults):
 | 
				
			||||||
    morph_rules = MORPH_RULES
 | 
					    morph_rules = MORPH_RULES
 | 
				
			||||||
    infixes = TOKENIZER_INFIXES
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
    suffixes = TOKENIZER_SUFFIXES
 | 
					    suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
    tag_map = TAG_MAP
 | 
					 | 
				
			||||||
    stop_words = STOP_WORDS
 | 
					    stop_words = STOP_WORDS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,3 @@
 | 
				
			||||||
# coding: utf8
 | 
					 | 
				
			||||||
from __future__ import unicode_literals
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
Example sentences to test spaCy and its language models.
 | 
					Example sentences to test spaCy and its language models.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
Some files were not shown because too many files have changed in this diff.