Merge branch 'master' of ssh://github.com/honnibal/spaCy

Commit fb8d50b3d5
.gitignore (vendored): 1 change

@@ -17,6 +17,7 @@ models/
 spacy/syntax/*.cpp
 spacy/syntax/*.html
 spacy/en/*.cpp
+spacy/en/data/*
 spacy/*.cpp
 spacy/ner/*.cpp
 spacy/orthography/*.cpp
@@ -8,12 +8,12 @@ python:
 - "2.7"
 - "3.4"

-# command to install dependencies
+# install dependencies
 install:
 - "pip install --upgrade setuptools"
 - "pip install -r requirements.txt"
 - "export PYTHONPATH=`pwd`"
 - "python setup.py build_ext --inplace"
-# command to run tests
+# run tests
 script:
 - py.test tests/
@@ -3,20 +3,18 @@ spaCy

 http://honnibal.github.io/spaCy

-Fast, state-of-the-art natural language processing pipeline. Commercial licenses available, or use under AGPL.
+A pipeline for fast, state-of-the-art natural language processing. Commercial licenses available, otherwise under AGPL.

 Version 0.80 released
 ---------------------

 2015-04-13

-* Preliminary named entity recognition support. Accuracy is currently
-  substantially behind the current state-of-the-art. I'm working on
-  improvements.
+* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.

 * Better sentence boundary detection, drawn from the syntactic structure.

-* Lots of bug fixes
+* Lots of bug fixes.


 Supports:
@@ -35,4 +33,3 @@ Difficult to support:

 * PyPy 2.7
 * PyPy 3.4
-
@@ -30,5 +30,3 @@ def main(text_loc):

 if __name__ == '__main__':
     plac.call(main)
-
-
@@ -7,8 +7,6 @@ from os import path
 import shutil
 import codecs
 import random
-import time
-import gzip

 import plac
 import cProfile
@@ -1,8 +1,6 @@
 """Read a vector file, and prepare it as binary data, for easy consumption"""

-import bz2
 import plac
-import struct

 from spacy.vocab import write_binary_vectors
@@ -1,7 +1,7 @@
 Signing the Contributors License Agreement
 ==========================================

 SpaCy is a commercial open-source project, owned by Syllogism Co. We require that contributors to SpaCy sign our Contributors License Agreement, which is based on the Oracle Contributor Agreement.

 The CLA must be signed on your first pull request. To do this, simply fill in the file cla_template.md, and include the filed in form in your first pull request.
@@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in th
     $ git add -A spaCy/contributors/<your GitHub username>.md

 Now finish your pull request, and you're done.
-
-
@@ -2,7 +2,7 @@ Syllogism Contributor Agreement
 ===============================

 This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
 Agreement. The SCA applies to any contribution that you make to any product or
 project managed by us (the “project”), and sets out the intellectual property
 rights you grant to us in the contributed materials. The term “us” shall mean
 Syllogism Co. The term "you" shall mean the person or entity identified below.
contributors/suchow.md (new file): 95 additions

@@ -0,0 +1,95 @@
+Syllogism Contributor Agreement
+===============================
+
+This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
+Agreement. The SCA applies to any contribution that you make to any product or
+project managed by us (the “project”), and sets out the intellectual property
+rights you grant to us in the contributed materials. The term “us” shall mean
+Syllogism Co. The term "you" shall mean the person or entity identified below.
+If you agree to be bound by these terms, fill in the information requested below
+and include the filled-in version with your first pull-request, under the file
+contrbutors/. The name of the file should be your GitHub username, with the
+extension .md. For example, the user example_user would create the file
+spaCy/contributors/example_user.md .
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+1. The term 'contribution' or ‘contributed materials’ means any source code,
+object code, patch, tool, sample, graphic, specification, manual, documentation,
+or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and registrations,
+in your contribution:
+* you hereby assign to us joint ownership, and to the extent that such assignment
+is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
+irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
+to exercise all rights under those copyrights. This includes, at our option, the
+right to sublicense these same rights to third parties through multiple levels of
+sublicensees or other licensing arrangements;
+
+* you agree that each of us can do all things in relation to your contribution
+as if each of us were the sole owners, and if one of us makes a derivative work
+of your contribution, the one who makes the derivative work (or has it made) will
+be the sole owner of that derivative work;
+
+* you agree that you will not assert any moral rights in your contribution against
+us, our licensees or transferees;
+
+* you agree that we may register a copyright in your contribution and exercise
+all ownership rights associated with it; and
+
+* you agree that neither of us has any duty to consult with, obtain the consent
+of, pay or render an accounting to the other for any use or distribution of your
+contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
+worldwide, no-charge, royalty-free license to:
+
+* make, have made, use, sell, offer to sell, import, and otherwise transfer your
+contribution in whole or in part, alone or in combination with
+or included in any product, work or materials arising out of the project to
+which your contribution was submitted, and
+
+* at our option, to sublicense these same rights to third parties through multiple
+levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective on
+the date you first submitted a contribution to us, even if your submission took
+place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+* Each contribution that you submit is and shall be an original work of authorship
+and you can legally grant the rights set out in this SCA;
+
+* to the best of your knowledge, each contribution will not violate any third
+party's copyrights, trademarks, patents, or other intellectual property rights; and
+
+* each contribution shall be in compliance with U.S. export control laws and other
+applicable export and import laws. You agree to notify us if you become aware of
+any circumstance which would make any of the foregoing representations inaccurate
+in any respect. Syllogism Co. may publicly disclose your participation in the project,
+including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable U.S.
+Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+x___ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions.
+
+____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jordan Suchow        |
+| Company's name (if applicable) |                      |
+| Title or Role (if applicable)  |                      |
+| Date                           | 2015-04-19           |
+| GitHub username                | suchow               |
+| Website (optional)             | http://suchow.io     |
@@ -64,8 +64,6 @@ def clean(ext):
         if os.path.exists(html):
             os.unlink(html)

-
-
 HERE = os.path.dirname(__file__)
 virtual_env = os.environ.get('VIRTUAL_ENV', '')
 compile_args = []
@@ -107,7 +107,7 @@ API
 *derivational* suffixes are not stripped, e.g. the lemma of "instutitions"
 is "institution", not "institute". Lemmatization is performed using the
 WordNet data, but extended to also cover closed-class words such as
 pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his".
 We assign pronouns the lemma -PRON-.

 lower
@@ -121,7 +121,7 @@ API
 A transform of the word's string, to show orthographic features. The
 characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
 After these mappings, sequences of 4 or more of the same character are
 truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
 :) --> :)

 prefix
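The shape transform documented in the hunk above is mechanical enough to sketch
directly. Here is a minimal illustration of the described mapping, written for
these notes (it is not spaCy's actual implementation):

.. code:: python

    def word_shape(string):
        # a-z -> x, A-Z -> X, 0-9 -> d; everything else passes through.
        # Runs of 4 or more of the same shape character are truncated to 4.
        shape = []
        last, run = "", 0
        for char in string:
            if char.islower():
                mapped = "x"
            elif char.isupper():
                mapped = "X"
            elif char.isdigit():
                mapped = "d"
            else:
                mapped = char
            run = run + 1 if mapped == last else 1
            if run <= 4:
                shape.append(mapped)
            last = mapped
        return "".join(shape)

    assert word_shape("C3Po") == "XdXx"
    assert word_shape("favorite") == "xxxx"
    assert word_shape(":)") == ":)"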
@@ -66,7 +66,7 @@ Boolean features
 +-------------+--------------------------------------------------------------+
 | IS_UPPER    | The result of sic.isupper()                                  |
 +-------------+--------------------------------------------------------------+
 | LIKE_URL    | Check whether the string looks like it could be a URL. Aims  |
 |             | for low false negative rate.                                 |
 +-------------+--------------------------------------------------------------+
 | LIKE_NUMBER | Check whether the string looks like it could be a numeric    |

@@ -75,4 +75,3 @@ Boolean features
 +-------------+--------------------------------------------------------------+
 | IN_LIST     | Facility for loading arbitrary run-time word lists?          |
 +-------------+--------------------------------------------------------------+
-
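The LIKE_URL flag in the table above is specified only by intent: say "yes" to
anything plausibly URL-shaped, keeping false negatives rare. One plausible
heuristic in that spirit (an invented illustration, not spaCy's actual rules):

.. code:: python

    def like_url(string):
        # Bias towards True: the flag aims for a low false-negative rate.
        if not string or " " in string:
            return False
        if string.startswith(("http://", "https://", "www.")):
            return True
        host = string.split("/", 1)[0]
        if "." not in host:
            return False
        tld = host.rsplit(".", 1)[1]
        return tld.isalpha() and 2 <= len(tld) <= 4

    assert like_url("www.google.com") and like_url("google.com")
    assert not like_url("dog")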
@@ -6,7 +6,7 @@ What and Why

 spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon.

 Most tokenizers give you a sequence of strings. That's barbaric.
 Giving you strings invites you to compute on every *token*, when what
 you should be doing is computing on every *type*. Remember
 `Zipf's law <http://en.wikipedia.org/wiki/Zipf's_law>`_: you'll

@@ -28,14 +28,14 @@ can access an excellent set of pre-computed orthographic and distributional feat
     >>> are.check_flag(en.CAN_NOUN)
     False

-spaCy makes it easy to write very efficient NLP applications, because your feature
+spaCy makes it easy to write efficient NLP applications, because your feature
 functions have to do almost no work: almost every lexical property you'll want
 is pre-computed for you. See the tutorial for an example POS tagger.

 Benchmark
 ---------

-The tokenizer itself is also very efficient:
+The tokenizer itself is also efficient:

 +--------+-------+--------------+--------------+
 | System | Time  | Words/second | Speed Factor |
@@ -56,7 +56,7 @@ Pros:

 - All tokens come with indices into the original string
 - Full unicode support
-- Extensible to other languages
+- Extendable to other languages
 - Batch operations computed efficiently in Cython
 - Cython API
 - numpy interoperability

@@ -68,4 +68,3 @@ Cons:
 - Higher memory usage (up to 1gb)
 - More conceptually complicated
 - Tokenization rules expressed in code, not as data
-
@@ -116,7 +116,7 @@ this was written quickly and has not been executed):


 This procedure splits off tokens from the start and end of the string, at each
 point checking whether the remaining string is in our special-cases table. If
 it is, we stop splitting, and return the tokenization at that point.

 The advantage of this design is that the prefixes, suffixes and special-cases
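That split-off procedure is short enough to sketch. The rule tables and the
special-cases dictionary below are invented for the illustration; they are not
spaCy's actual data or API:

.. code:: python

    def tokenize(string, special_cases, prefixes=("(", '"'), suffixes=(")", '"', "!", ".")):
        # Peel prefixes off the front and suffixes off the back, checking at
        # each step whether the remaining string is a known special case.
        tokens, trailing = [], []
        while string:
            if string in special_cases:
                tokens.extend(special_cases[string])
                break
            prefix = next((p for p in prefixes if len(string) > len(p) and string.startswith(p)), None)
            if prefix is not None:
                tokens.append(prefix)
                string = string[len(prefix):]
                continue
            suffix = next((s for s in suffixes if len(string) > len(s) and string.endswith(s)), None)
            if suffix is not None:
                trailing.append(suffix)
                string = string[:-len(suffix)]
                continue
            tokens.append(string)
            break
        tokens.extend(reversed(trailing))
        return tokens

    # "can't" is a special case that splits into two tokens:
    assert tokenize("(can't!", {"can't": ["ca", "n't"]}) == ["(", "ca", "n't", "!"]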
@@ -135,7 +135,7 @@ lexical types.

 In a sample of text, vocabulary size grows exponentially slower than word
 count. So any computations we can perform over the vocabulary and apply to the
-word count are very efficient.
+word count are efficient.


 Part-of-speech Tagger
|
||||||
class_, score = max(enumerate(scores), key=lambda item: item[1])
|
class_, score = max(enumerate(scores), key=lambda item: item[1])
|
||||||
transition(state, class_)
|
transition(state, class_)
|
||||||
|
|
||||||
The parser makes 2N transitions for a sentence of length N. In order to select
|
The parser makes 2N transitions for a sentence of length N. In order to select
|
||||||
the transition, it extracts a vector of K features from the state. Each feature
|
the transition, it extracts a vector of K features from the state. Each feature
|
||||||
is used as a key into a hash table managed by the model. The features map to
|
is used as a key into a hash table managed by the model. The features map to
|
||||||
a vector of weights, of length C. We then dot product the feature weights to the
|
a vector of weights, of length C. We then dot product the feature weights to the
|
||||||
scores vector we are building for that instance.
|
scores vector we are building for that instance.
|
||||||
|
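Concretely, that scoring step amounts to summing one C-length weight row per
extracted feature. A minimal sketch of the accumulation, with invented names
for the model's table and features:

.. code:: python

    import numpy

    def score_classes(weights_table, features, n_classes):
        # Each feature keys into a table of C-length weight rows; the rows
        # are summed into the scores for the current parse state.
        scores = numpy.zeros(n_classes)
        for feature in features:
            row = weights_table.get(feature)
            if row is not None:
                scores += row
        return scores

    table = {("w0", "the"): numpy.array([0.5, -0.1, 0.0]),
             ("p0", "DT"): numpy.array([0.2, 0.3, -0.4])}
    scores = score_classes(table, [("w0", "the"), ("p0", "DT")], 3)
    assert scores.argmax() == 0   # class 0 scores 0.5 + 0.2 = 0.7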
@@ -260,5 +260,3 @@ these models is really all about the data structures. We want to stay small,
 and stay contiguous. Minimize redundancy and minimize pointer chasing.
 That's why Cython is so well suited to this: we get to lay out our data
 structures, and manage the memory ourselves, with full C-level control.
-
-
@@ -10,7 +10,7 @@ spaCy: Industrial-strength NLP

 .. _Issue Tracker: https://github.com/honnibal/spaCy/issues

 **13/04**: *Version 0.80 released. Includes named entity recognition, better sentence
 boundary detection, and many bug fixes.*

 `spaCy`_ is a new library for text processing in Python and Cython.
@@ -28,7 +28,7 @@ If they don't want to stay in academia, they join Google, IBM, etc.

 The net result is that outside of the tech giants, commercial NLP has changed
 little in the last ten years. In academia, it's changed entirely. Amazing
 improvements in quality. Orders of magnitude faster. But the
 academic code is always GPL, undocumented, unuseable, or all three. You could
 implement the ideas yourself, but the papers are hard to read, and training
 data is exorbitantly expensive. So what are you left with? A common answer is
@@ -37,7 +37,7 @@ tokenizer is suitable for production use.

 I used to think that the NLP community just needed to do more to communicate
 its findings to software engineers. So I wrote two blog posts, explaining
-`how to write a part-of-speech tagger`_ and `parser`_. Both were very well received,
+`how to write a part-of-speech tagger`_ and `parser`_. Both were well received,
 and there's been a bit of interest in `my research software`_ --- even though
 it's entirely undocumented, and mostly unuseable to anyone but me.
@@ -58,7 +58,7 @@ to embedded word representations, and a range of useful features are pre-calcula
 and cached.

 If none of that made any sense to you, here's the gist of it. Computers don't
 understand text. This is unfortunate, because that's what the web almost entirely
 consists of. We want to recommend people text based on other text they liked.
 We want to shorten text to display it on a mobile screen. We want to aggregate
 it, link it, filter it, categorise it, generate it and correct it.
@@ -202,7 +202,7 @@ this:

 We wanted to refine the logic so that only adverbs modifying evocative verbs
 of communication, like "pleaded", were highlighted. We've now built a vector that
-represents that type of word, so now we can highlight adverbs based on very
+represents that type of word, so now we can highlight adverbs based on
 subtle logic, honing in on adverbs that seem the most stylistically
 problematic, given our starting assumptions:
@@ -242,7 +242,7 @@ I report mean times per document, in milliseconds.

 **Hardware**: Intel i7-3770 (2012)

 .. table:: Efficiency comparison. Lower is better.

 +--------------+---------------------------+--------------------------------+
 |              | Absolute (ms per doc)     | Relative (to spaCy)            |
@@ -278,7 +278,7 @@ representations.
     publish or perform any benchmark or performance tests or analysis relating to
     the Service or the use thereof without express authorization from AlchemyAPI;

 .. Did you get that? You're not allowed to evaluate how well their system works,
    unless you're granted a special exception. Their system must be pretty
    terrible to motivate such an embarrassing restriction.
    They must know this makes them look bad, but they apparently believe allowing
@@ -278,6 +278,3 @@ sentence represents the document as a whole.

 Document Model
 --------------
-
-
-
@@ -35,7 +35,7 @@ And if you're ever in acquisition or IPO talks, the story is simple.
 spaCy can also be used as free open-source software, under the Aferro GPL
 license. If you use it this way, you must comply with the AGPL license terms.
 When you distribute your project, or offer it as a network service, you must
-distribute the source-code, and grant users an AGPL license to it.
+distribute the source-code and grant users an AGPL license to it.


 .. I left academia in June 2014, just when I should have been submitting my first
@@ -92,7 +92,7 @@ developing. They own the copyright to any modifications they make to spaCy,
 but not to the original spaCy code.

 No additional fees will be due when they hire new developers, run spaCy on
 additional internal servers, etc. If their company is acquired, the license will
 be transferred to the company acquiring them. However, to use spaCy in another
 product, they will have to buy a second license.
@@ -115,9 +115,9 @@ In order to do this, they must sign a contributor agreement, ceding their
 copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
 not be able to claim any royalties from their contributions.

 Later, Alex and Sasha implement new features into spaCy, for another paper. The
 code was quite rushed, and they don't want to take the time to put together a
 proper pull request. They must release their modifications under the AGPL, but
 they are not obliged to contribute it to the spaCy repository, or concede their
 copyright.
@@ -126,8 +126,8 @@ Phuong and Jessie: Open Source developers
 #########################################

 Phuong and Jessie use the open-source software Calibre to manage their e-book
 libraries. They have an idea for a search feature, and they want to use spaCy
 to implement it. Calibre is released under the GPLv3. The AGPL has additional
 restrictions for projects used as a network resource, but they don't apply to
 this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll
 have to release their code, but that was always their intention anyway.
@@ -23,7 +23,7 @@ parser model and word vectors, which it installs within the spacy.en package dir

 If you're stuck using a server with an old version of Python, and you don't
 have root access, I've prepared a bootstrap script to help you compile a local
 Python install. Run:

 .. code:: bash
@@ -47,7 +47,7 @@ this is how I build the project.
     $ py.test tests/

 Python packaging is awkward at the best of times, and it's particularly tricky
 with C extensions, built via Cython, requiring large data files. So, please
 report issues as you encounter them, and bear with me :)

 Usage
@@ -234,4 +234,3 @@ Features
 +---------+-----------------------------------------------------------+
 | prob    | Log probability of word, smoothed with Simple Good-Turing |
 +---------+-----------------------------------------------------------+
-
@@ -7,8 +7,8 @@ Updates
 Five days ago I presented the alpha release of spaCy, a natural language
 processing library that brings state-of-the-art technology to small companies.

-spaCy has been very well received, and there are now a lot of eyes on the project.
-Naturally, lots of issues have surfaced. I'm very grateful to those who've reported
+spaCy has been well received, and there are now a lot of eyes on the project.
+Naturally, lots of issues have surfaced. I'm grateful to those who've reported
 them. I've worked hard to address them as quickly as I could.

 Bug Fixes
@@ -26,13 +26,13 @@ Bug Fixes
   just store an index into that list, instead of a hash.

 * Parse tree navigation API was rough, and buggy.
   The parse-tree navigation API was the last thing I added before v0.3. I've
   now replaced it with something better. The previous API design was flawed,
   and the implementation was buggy --- Token.child() and Token.head were
   sometimes inconsistent.

   I've addressed the most immediate problems, but this part of the design is
   still a work in progress. It's a difficult problem. The parse is a tree,
   and we want to freely navigate up and down it without creating reference
   cycles that inhibit garbage collection, and without doing a lot of copying,
   creating and deleting.
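One standard way to get that kind of cycle-free navigation is to store integer
head offsets rather than Python object references, and build token objects on
demand. A sketch of the general technique (illustrative classes, not spaCy's
internals):

.. code:: python

    class Doc(object):
        def __init__(self, words, head_offsets):
            self.words = words
            # The head of token i is token i + head_offsets[i]; 0 marks a root.
            self.head_offsets = head_offsets

    class Token(object):
        def __init__(self, doc, i):
            self.doc = doc
            self.i = i

        @property
        def head(self):
            # Rebuilt from an integer on each access, so no Token-to-Token
            # reference is ever stored and no reference cycle can form.
            return Token(self.doc, self.i + self.doc.head_offsets[self.i])

    doc = Doc(["He", "pleaded", "."], [1, 0, -1])
    assert Token(doc, 0).head.i == 1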
@@ -53,7 +53,7 @@ pinning down or reproducing. Please send details of your system to the
 Enhancements: Train and evaluate on whole paragraphs
 ----------------------------------------------------

 .. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser.


 Most English parsing research is performed on text with perfect pre-processing:
@@ -77,7 +77,7 @@ made a big difference:
 | Corrected   | 89.9  | 88.8     |
 +-------------+-------+----------+

 .. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable.
@@ -108,9 +108,9 @@ input to be segmented into sentences, but with no sentence segmenter. This
 caused a drop in parse accuracy of 4%!

 Over the last five days, I've worked hard to correct this. I implemented the
-modifications to the parsing algorithm I had planned, from Dongdong Zhang et al
+modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
 (2013), and trained and evaluated the parser on raw text, using the version of
-the WSJ distributed by Read et al (2012), and used in Dridan and Oepen's
+the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
 experiments.

 I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly
fabfile.py (vendored): 2 changes

@@ -1,4 +1,4 @@
-from fabric.api import local, run, lcd, cd, env
+from fabric.api import local, lcd, env
 from os.path import exists as file_exists
 from fabtools.python import virtualenv
 from os import path
setup.py: 5 changes

@@ -1,16 +1,11 @@
 #!/usr/bin/env python
-import subprocess
 from setuptools import setup
-from glob import glob
 import shutil

 import sys
 import os
 from os import path
-from os.path import splitext

-
-import shutil
 from setuptools import Extension
 from distutils import sysconfig
 import platform
@@ -79,5 +79,3 @@ cpdef enum attr_id_t:
     POS
     TAG
     DEP
-
-
@@ -22,4 +22,3 @@ cdef class EnPosTagger:

     cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
     cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
-
@@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
         context[7] = 4
     else:
         context[7] = 0
-
@@ -149,5 +149,3 @@ cpdef enum:


 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
-
-
@@ -15,4 +15,3 @@ cdef class Span:
     cdef public Span head
     cdef public list rights
     cdef public list lefts
-
@@ -277,5 +277,3 @@ class OracleError(Exception):

 class UnknownMove(Exception):
     pass
-
-
@@ -13,5 +13,3 @@ class Config(object):
     @classmethod
     def read(cls, model_dir, name):
         return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
-
-
@@ -630,4 +630,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
 Check that the parser data is installed. Run "python -m spacy.en.download" if not.
 Check whether parse=False in the call to English.__call__
 """
-
@@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
-
-
@@ -1,4 +1,3 @@
-import os
 from os import path
 import codecs
 import json
@@ -33,4 +33,3 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

     cdef PreshMap _map
-
@@ -7,6 +7,7 @@ from spacy.lexeme import lex_of

 from spacy import LEX, NORM, SHAPE, LAST3

+
 def test_group_by_lex():
     tokens = en.tokenize("I like the red one and I like the blue one")
     names, hashes, groups = tokens.group_by(LEX)
@@ -40,6 +40,7 @@ def test_begin(state, sentence):
     assert not state.is_valid('O')
     assert not state.is_valid('U-PER')

+
 def test_in(state, sentence):
     state.transition('B-PER')
     assert state.n_ents == 0
@@ -30,6 +30,3 @@ def test_align_continue():
     assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
     assert aligned[3] == ('and', [(13, 16)])
     assert aligned[4] == ('continue', [(16, 24)])
-
-
-
@@ -37,5 +37,3 @@ def test_dep():
     assert feats_array[1][1] == tokens[1].dep
     assert feats_array[2][1] == tokens[2].dep
     assert feats_array[3][1] == tokens[3].dep
-
-
@@ -2,6 +2,7 @@
 """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 from spacy.en.attrs import IS_LOWER

+
 def test_1():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -39,6 +40,7 @@ def test2():
     nlp.vocab[u'quietly'].prob
     -11.07155704498291

+
 def test3():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -8,6 +8,7 @@ from spacy.en import English
 def EN():
     return English()

+
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
     tokens = EN(text)
@@ -16,6 +16,7 @@ def words():
     return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!",
             "!d", "\nd"]

+
 def test_is_alpha(words):
     assert not is_alpha(words[0])
     assert not is_alpha(words[1])
@@ -5,10 +5,12 @@ from spacy.strings import StringStore

 import pytest

+
 @pytest.fixture
 def sstore():
     return StringStore()

+
 def test_save_bytes(sstore):
     Hello_i = sstore[b'Hello']
     assert Hello_i == 1
@@ -2,10 +2,12 @@ import pytest

 from spacy.en import English

+
 @pytest.fixture
 def EN():
     return English()

+
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
         lex = EN.vocab[i]
@@ -35,4 +35,3 @@ def test_merge_heads():
 def test_issue_54():
     text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
     tokens = NLU(text, merge_mwes=True)
-
@@ -17,6 +17,7 @@ def morph_exc():
         'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}},
     }

+
 def test_load_exc(EN, morph_exc):
     EN.tagger.load_morph_exceptions(morph_exc)
     tokens = EN('I like his style.', tag=True)
@@ -3,6 +3,7 @@ from spacy.en import English

 nlp = English()

+
 def test_simple_types():
     tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)
@@ -33,4 +33,3 @@ def test_word():
 def test_not_number():
     assert not like_number('dog')
     assert not like_number(',')
-
@@ -3,6 +3,7 @@ import pytest

 from spacy.en import English

+
 def test_only_pre1():
     EN = English()
     assert len(EN("(")) == 1
@@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
         assert not children
     for head_index, children in rights.items():
         assert not children
-
@@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):
 def test_double_end_quote(EN):
     assert len(EN("Hello''")) == 2
     assert len(EN("''")) == 1
-
@@ -3,6 +3,7 @@ from spacy.en import English

 import pytest

+
 @pytest.fixture
 def EN():
     return English()
@@ -8,20 +8,26 @@ from spacy.orth import word_shape as ws
 def test_capitalized():
     assert ws('Nasa') == 'Xxxx'

+
 def test_truncate():
     assert ws('capitalized') == 'xxxx'

+
 def test_digits():
     assert ws('999999999') == 'dddd'

+
 def test_mix():
     assert ws('C3P0') == 'XdXd'

+
 def test_punct():
     assert ws(',') == ','

+
 def test_space():
     assert ws('\n') == '\n'

+
 def test_punct_seq():
     assert ws('``,-') == '``,-'
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import English

 import pytest
-import re


 EN = English()
@@ -13,9 +13,11 @@ def EN():
 def test_no_special(EN):
     assert len(EN("(can)")) == 3

+
 def test_no_punct(EN):
     assert len(EN("can't")) == 2

+
 def test_prefix(EN):
     assert len(EN("(can't")) == 3
@@ -16,6 +16,3 @@ def test_one(EN):
     assert tokens[0].orth_ == 'Betty'
     tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].orth_ == 'Betty'
-
-
-
@@ -16,4 +16,3 @@ def test_subtrees():
     assert len(list(bus.children)) == 1

     assert len(list(wheels.subtree)) == 6
-
@@ -1,6 +1,7 @@
 from spacy.en import English
 import six

+
 def test_tag_names():
     nlp = English()
     tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True)
@@ -6,6 +6,7 @@ import pytest

 NLU = English()

+
 def test_am_pm():
     numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
     variants = ['a.m.', 'am', 'p.m.', 'pm']
@@ -4,6 +4,7 @@ import pytest
 from spacy.en import English
 from spacy.parts_of_speech import ADV

+
 @pytest.fixture
 def nlp():
     return English()
@@ -7,6 +7,8 @@ from spacy.en.attrs import IS_STOP
 import pytest

 nlp = English()
+
+
 @pytest.fixture
 def token():
     tokens = nlp(u'Give it back! He pleaded.')

@@ -35,5 +37,3 @@ def test_single_token_string():
     nlp = English()
     tokens = nlp(u'foobar')
     assert tokens[0].string == 'foobar'
-
-
@@ -31,6 +31,7 @@ def _orphan_from_list(toks):
         lst.append(tok)
     return lst

+
 def test_list_orphans():
     # Test case from NSchrading
     nlp = English()
@@ -10,10 +10,12 @@ from spacy.en import English
 def EN():
     return English().tokenizer

+
 def test_no_word(EN):
     tokens = EN(u'')
     assert len(tokens) == 0

+
 def test_single_word(EN):
     tokens = EN(u'hello')
     assert tokens[0].orth_ == 'hello'

@@ -60,6 +62,7 @@ def test_contraction_punct(EN):
     tokens = EN("can't!")
     assert len(tokens) == 3

+
 def test_sample(EN):
     text = """Tributes pour in for late British Labour Party leader
@@ -3,6 +3,7 @@ from spacy.en import English

 import pytest

+
 @pytest.fixture
 def tokens():
     nlp = English()
@@ -2,6 +2,7 @@ from __future__ import unicode_literals

 from spacy.orth import like_url

+
 def test_basic_url():
     assert like_url('www.google.com')
     assert like_url('google.com')
@@ -4,15 +4,18 @@ from spacy.en import English

 import pytest

+
 @pytest.fixture
 def EN():
     return English()

+
 def test_vec(EN):
     hype = EN.vocab['hype']
     assert hype.orth_ == 'hype'
     assert 0.08 >= hype.repvec[0] > 0.07

+
 def test_capitalized(EN):
     hype = EN.vocab['Hype']
     assert hype.orth_ == 'Hype'
@@ -39,5 +39,3 @@ def test_newline_double_space(EN):
 def test_newline_space_wrap(EN):
     tokens = EN('hello \n possums')
     assert len(tokens) == 3
-
-
@@ -4,7 +4,6 @@ from spacy.en import English
 from spacy.util import utf8open

 import pytest
-import os
 from os import path