Merge branch 'master' of ssh://github.com/honnibal/spaCy

This commit is contained in:
Matthew Honnibal 2015-04-30 12:45:15 +02:00
commit fb8d50b3d5
94 changed files with 297 additions and 226 deletions

1
.gitignore vendored
View File

@ -17,6 +17,7 @@ models/
spacy/syntax/*.cpp spacy/syntax/*.cpp
spacy/syntax/*.html spacy/syntax/*.html
spacy/en/*.cpp spacy/en/*.cpp
spacy/en/data/*
spacy/*.cpp spacy/*.cpp
spacy/ner/*.cpp spacy/ner/*.cpp
spacy/orthography/*.cpp spacy/orthography/*.cpp

View File

@ -8,12 +8,12 @@ python:
- "2.7" - "2.7"
- "3.4" - "3.4"
# command to install dependencies # install dependencies
install: install:
- "pip install --upgrade setuptools" - "pip install --upgrade setuptools"
- "pip install -r requirements.txt" - "pip install -r requirements.txt"
- "export PYTHONPATH=`pwd`" - "export PYTHONPATH=`pwd`"
- "python setup.py build_ext --inplace" - "python setup.py build_ext --inplace"
# command to run tests # run tests
script: script:
- py.test tests/ - py.test tests/

View File

@ -1,7 +1,7 @@
spaCy is commercial open-source software: you can buy a commercial spaCy is commercial open-source software: you can buy a commercial
license, or you can use it under the AGPL, as described below. license, or you can use it under the AGPL, as described below.
spaCy Natural Language Processing Tools spaCy Natural Language Processing Tools
Copyright (C) 2015 Matthew Honnibal Copyright (C) 2015 Matthew Honnibal
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify

View File

@ -3,20 +3,18 @@ spaCy
http://honnibal.github.io/spaCy http://honnibal.github.io/spaCy
Fast, state-of-the-art natural language processing pipeline. Commercial licenses available, or use under AGPL. A pipeline for fast, state-of-the-art natural language processing. Commercial licenses available, otherwise under AGPL.
Version 0.80 released Version 0.80 released
--------------------- ---------------------
2015-04-13 2015-04-13
* Preliminary named entity recognition support. Accuracy is currently * Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
substantially behind the current state-of-the-art. I'm working on
improvements.
* Better sentence boundary detection, drawn from the syntactic structure. * Better sentence boundary detection, drawn from the syntactic structure.
* Lots of bug fixes * Lots of bug fixes.
Supports: Supports:
@ -35,4 +33,3 @@ Difficult to support:
* PyPy 2.7 * PyPy 2.7
* PyPy 3.4 * PyPy 3.4

View File

@ -30,5 +30,3 @@ def main(text_loc):
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) plac.call(main)

View File

@ -7,8 +7,6 @@ from os import path
import shutil import shutil
import codecs import codecs
import random import random
import time
import gzip
import plac import plac
import cProfile import cProfile
@ -134,7 +132,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,
print 'NER P', scorer.ents_p print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f print 'NER F', scorer.ents_f
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) plac.call(main)

View File

@ -1,15 +1,13 @@
"""Read a vector file, and prepare it as binary data, for easy consumption""" """Read a vector file, and prepare it as binary data, for easy consumption"""
import bz2
import plac import plac
import struct
from spacy.vocab import write_binary_vectors from spacy.vocab import write_binary_vectors
def main(in_loc, out_loc): def main(in_loc, out_loc):
write_binary_vectors(in_loc, out_loc) write_binary_vectors(in_loc, out_loc)
if __name__ == '__main__': if __name__ == '__main__':
plac.call(main) plac.call(main)

View File

@ -1,7 +1,7 @@
Signing the Contributors License Agreement Signing the Contributors License Agreement
========================================== ==========================================
SpaCy is a commercial open-source project, owned by Syllogism Co. We require that contributors to SpaCy sign our Contributors License Agreement, which is based on the Oracle Contributor Agreement. SpaCy is a commercial open-source project, owned by Syllogism Co. We require that contributors to SpaCy sign our Contributors License Agreement, which is based on the Oracle Contributor Agreement.
The CLA must be signed on your first pull request. To do this, simply fill in the file cla_template.md, and include the filled-in form in your first pull request. The CLA must be signed on your first pull request. To do this, simply fill in the file cla_template.md, and include the filled-in form in your first pull request.
@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in th
$ git add -A spaCy/contributors/<your GitHub username>.md $ git add -A spaCy/contributors/<your GitHub username>.md
Now finish your pull request, and you're done. Now finish your pull request, and you're done.

View File

@ -2,7 +2,7 @@ Syllogism Contributor Agreement
=============================== ===============================
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
Agreement. The SCA applies to any contribution that you make to any product or Agreement. The SCA applies to any contribution that you make to any product or
project managed by us (the “project”), and sets out the intellectual property project managed by us (the “project”), and sets out the intellectual property
rights you grant to us in the contributed materials. The term “us” shall mean rights you grant to us in the contributed materials. The term “us” shall mean
Syllogism Co. The term "you" shall mean the person or entity identified below. Syllogism Co. The term "you" shall mean the person or entity identified below.

95
contributors/suchow.md Normal file
View File

@ -0,0 +1,95 @@
Syllogism Contributor Agreement
===============================
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
Agreement. The SCA applies to any contribution that you make to any product or
project managed by us (the “project”), and sets out the intellectual property
rights you grant to us in the contributed materials. The term “us” shall mean
Syllogism Co. The term "you" shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested below
and include the filled-in version with your first pull-request, under the file
contributors/. The name of the file should be your GitHub username, with the
extension .md. For example, the user example_user would create the file
spaCy/contributors/example_user.md .
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
1. The term 'contribution' or contributed materials means any source code,
object code, patch, tool, sample, graphic, specification, manual, documentation,
or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and registrations,
in your contribution:
* you hereby assign to us joint ownership, and to the extent that such assignment
is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
to exercise all rights under those copyrights. This includes, at our option, the
right to sublicense these same rights to third parties through multiple levels of
sublicensees or other licensing arrangements;
* you agree that each of us can do all things in relation to your contribution
as if each of us were the sole owners, and if one of us makes a derivative work
of your contribution, the one who makes the derivative work (or has it made) will
be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution against
us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and exercise
all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the consent
of, pay or render an accounting to the other for any use or distribution of your
contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer your
contribution in whole or in part, alone or in combination with
or included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through multiple
levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective on
the date you first submitted a contribution to us, even if your submission took
place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of authorship
and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any third
party's copyrights, trademarks, patents, or other intellectual property rights; and
* each contribution shall be in compliance with U.S. export control laws and other
applicable export and import laws. You agree to notify us if you become aware of
any circumstance which would make any of the foregoing representations inaccurate
in any respect. Syllogism Co. may publicly disclose your participation in the project,
including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable U.S.
Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
x___ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions.
____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Jordan Suchow |
| Company's name (if applicable) | |
| Title or Role (if applicable) | |
| Date | 2015-04-19 |
| GitHub username | suchow |
| Website (optional) | http://suchow.io |

View File

@ -64,8 +64,6 @@ def clean(ext):
if os.path.exists(html): if os.path.exists(html):
os.unlink(html) os.unlink(html)
HERE = os.path.dirname(__file__) HERE = os.path.dirname(__file__)
virtual_env = os.environ.get('VIRTUAL_ENV', '') virtual_env = os.environ.get('VIRTUAL_ENV', '')
compile_args = [] compile_args = []
@ -102,7 +100,7 @@ exts = [
Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args), Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args),
Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"], Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"],
**ext_args) **ext_args)
#Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
#Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
#Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),

View File

@ -28,7 +28,7 @@ API
.. autoclass:: spacy.tokens.Tokens .. autoclass:: spacy.tokens.Tokens
+---------------+-------------+-------------+ +---------------+-------------+-------------+
| Attribute | Type | Attr API | | Attribute | Type | Attr API |
+===============+=============+=============+ +===============+=============+=============+
@ -48,7 +48,7 @@ API
For faster access, the underlying C data can be accessed from Cython. You For faster access, the underlying C data can be accessed from Cython. You
can also export the data to a numpy array, via `Tokens.to_array`, if pure Python can also export the data to a numpy array, via `Tokens.to_array`, if pure Python
access is required, and you need slightly better performance. However, this access is required, and you need slightly better performance. However, this
is both slower and has a worse API than Cython access. is both slower and has a worse API than Cython access.
.. autoclass:: spacy.tokens.Token .. autoclass:: spacy.tokens.Token
@ -107,7 +107,7 @@ API
*derivational* suffixes are not stripped, e.g. the lemma of "institutions" *derivational* suffixes are not stripped, e.g. the lemma of "institutions"
is "institution", not "institute". Lemmatization is performed using the is "institution", not "institute". Lemmatization is performed using the
WordNet data, but extended to also cover closed-class words such as WordNet data, but extended to also cover closed-class words such as
pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his".
We assign pronouns the lemma -PRON-. We assign pronouns the lemma -PRON-.
lower lower
@ -119,9 +119,9 @@ API
shape shape
A transform of the word's string, to show orthographic features. The A transform of the word's string, to show orthographic features. The
characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
After these mappings, sequences of 4 or more of the same character are After these mappings, sequences of 4 or more of the same character are
truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
:) --> :) :) --> :)
prefix prefix
@ -161,7 +161,7 @@ API
pos pos
A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB,
ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech.
dep dep
The type of syntactic dependency relation between the word and its The type of syntactic dependency relation between the word and its
syntactic head. syntactic head.
@ -185,10 +185,10 @@ API
rights rights
An iterator for the immediate rightward syntactic children of the word. An iterator for the immediate rightward syntactic children of the word.
children children
An iterator that yields from lefts, and then yields from rights. An iterator that yields from lefts, and then yields from rights.
subtree subtree
An iterator for the part of the sentence syntactically governed by the An iterator for the part of the sentence syntactically governed by the
word, including the word itself. word, including the word itself.
@ -205,15 +205,15 @@ API
.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) .. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None)
.. py:method:: __len__(self) --> int .. py:method:: __len__(self) --> int
.. py:method:: __getitem__(self, id: int) --> unicode .. py:method:: __getitem__(self, id: int) --> unicode
.. py:method:: __getitem__(self, string: unicode) --> int .. py:method:: __getitem__(self, string: unicode) --> int
.. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None
.. py:method:: dump(self, loc: unicode) --> None .. py:method:: dump(self, loc: unicode) --> None
.. py:method:: load_lexemes(self, loc: unicode) --> None .. py:method:: load_lexemes(self, loc: unicode) --> None
.. py:method:: load_vectors(self, loc: unicode) --> None .. py:method:: load_vectors(self, loc: unicode) --> None
@ -223,9 +223,9 @@ API
.. py:method:: __len__(self) --> int .. py:method:: __len__(self) --> int
.. py:method:: __getitem__(self, id: int) --> unicode .. py:method:: __getitem__(self, id: int) --> unicode
.. py:method:: __getitem__(self, string: bytes) --> id .. py:method:: __getitem__(self, string: bytes) --> id
.. py:method:: __getitem__(self, string: unicode) --> id .. py:method:: __getitem__(self, string: unicode) --> id
.. py:method:: dump(self, loc: unicode) --> None .. py:method:: dump(self, loc: unicode) --> None

View File

@ -66,7 +66,7 @@ Boolean features
+-------------+--------------------------------------------------------------+ +-------------+--------------------------------------------------------------+
| IS_UPPER | The result of sic.isupper() | | IS_UPPER | The result of sic.isupper() |
+-------------+--------------------------------------------------------------+ +-------------+--------------------------------------------------------------+
| LIKE_URL | Check whether the string looks like it could be a URL. Aims | | LIKE_URL | Check whether the string looks like it could be a URL. Aims |
| | for low false negative rate. | | | for low false negative rate. |
+-------------+--------------------------------------------------------------+ +-------------+--------------------------------------------------------------+
| LIKE_NUMBER | Check whether the string looks like it could be a numeric | | LIKE_NUMBER | Check whether the string looks like it could be a numeric |
@ -75,4 +75,3 @@ Boolean features
+-------------+--------------------------------------------------------------+ +-------------+--------------------------------------------------------------+
| IN_LIST | Facility for loading arbitrary run-time word lists? | | IN_LIST | Facility for loading arbitrary run-time word lists? |
+-------------+--------------------------------------------------------------+ +-------------+--------------------------------------------------------------+

View File

@ -6,7 +6,7 @@ What and Why
spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon. spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon.
Most tokenizers give you a sequence of strings. That's barbaric. Most tokenizers give you a sequence of strings. That's barbaric.
Giving you strings invites you to compute on every *token*, when what Giving you strings invites you to compute on every *token*, when what
you should be doing is computing on every *type*. Remember you should be doing is computing on every *type*. Remember
`Zipf's law <http://en.wikipedia.org/wiki/Zipf's_law>`_: you'll `Zipf's law <http://en.wikipedia.org/wiki/Zipf's_law>`_: you'll
@ -28,14 +28,14 @@ can access an excellent set of pre-computed orthographic and distributional feat
>>> are.check_flag(en.CAN_NOUN) >>> are.check_flag(en.CAN_NOUN)
False False
spaCy makes it easy to write very efficient NLP applications, because your feature spaCy makes it easy to write efficient NLP applications, because your feature
functions have to do almost no work: almost every lexical property you'll want functions have to do almost no work: almost every lexical property you'll want
is pre-computed for you. See the tutorial for an example POS tagger. is pre-computed for you. See the tutorial for an example POS tagger.
Benchmark Benchmark
--------- ---------
The tokenizer itself is also very efficient: The tokenizer itself is also efficient:
+--------+-------+--------------+--------------+ +--------+-------+--------------+--------------+
| System | Time | Words/second | Speed Factor | | System | Time | Words/second | Speed Factor |
@ -56,7 +56,7 @@ Pros:
- All tokens come with indices into the original string - All tokens come with indices into the original string
- Full unicode support - Full unicode support
- Extensible to other languages - Extendable to other languages
- Batch operations computed efficiently in Cython - Batch operations computed efficiently in Cython
- Cython API - Cython API
- numpy interoperability - numpy interoperability
@ -68,4 +68,3 @@ Cons:
- Higher memory usage (up to 1gb) - Higher memory usage (up to 1gb)
- More conceptually complicated - More conceptually complicated
- Tokenization rules expressed in code, not as data - Tokenization rules expressed in code, not as data

View File

@ -116,13 +116,13 @@ this was written quickly and has not been executed):
This procedure splits off tokens from the start and end of the string, at each This procedure splits off tokens from the start and end of the string, at each
point checking whether the remaining string is in our special-cases table. If point checking whether the remaining string is in our special-cases table. If
it is, we stop splitting, and return the tokenization at that point. it is, we stop splitting, and return the tokenization at that point.
The advantage of this design is that the prefixes, suffixes and special-cases The advantage of this design is that the prefixes, suffixes and special-cases
can be declared separately, in easy-to-understand files. If a new entry is can be declared separately, in easy-to-understand files. If a new entry is
added to the special-cases, you can be sure that it won't have some unforeseen added to the special-cases, you can be sure that it won't have some unforeseen
consequence to a complicated regular-expression grammar. consequence to a complicated regular-expression grammar.
Coupling the Tokenizer and Lexicon Coupling the Tokenizer and Lexicon
################################## ##################################
@ -135,7 +135,7 @@ lexical types.
In a sample of text, vocabulary size grows exponentially slower than word In a sample of text, vocabulary size grows exponentially slower than word
count. So any computations we can perform over the vocabulary and apply to the count. So any computations we can perform over the vocabulary and apply to the
word count are very efficient. word count are efficient.
Part-of-speech Tagger Part-of-speech Tagger
@ -159,7 +159,7 @@ Dependency Parser
The parser uses the algorithm described in my `2014 blog post`_. The parser uses the algorithm described in my `2014 blog post`_.
This algorithm, shift-reduce dependency parsing, is becoming widely adopted due This algorithm, shift-reduce dependency parsing, is becoming widely adopted due
to its compelling speed/accuracy trade-off. to its compelling speed/accuracy trade-off.
Some quick details about spaCy's take on this, for those who happen to know Some quick details about spaCy's take on this, for those who happen to know
these models well. I'll write up a better description shortly. these models well. I'll write up a better description shortly.
@ -176,7 +176,7 @@ scored 91.0. So how have I gotten it to 92.4? The following tweaks:
1. I use Brown cluster features --- these help a lot; 1. I use Brown cluster features --- these help a lot;
2. I redesigned the feature set. I've long known that the Zhang and Nivre 2. I redesigned the feature set. I've long known that the Zhang and Nivre
(2011) feature set was suboptimal, but a few features don't make a very (2011) feature set was suboptimal, but a few features don't make a very
compelling publication. Still, they're important. compelling publication. Still, they're important.
3. When I do the dynamic oracle training, I also make 3. When I do the dynamic oracle training, I also make
the update cost-sensitive: if the oracle determines that the move the parser the update cost-sensitive: if the oracle determines that the move the parser
took has a cost of N, then the weights for the gold class are incremented by took has a cost of N, then the weights for the gold class are incremented by
@ -206,8 +206,8 @@ loop:
class_, score = max(enumerate(scores), key=lambda item: item[1]) class_, score = max(enumerate(scores), key=lambda item: item[1])
transition(state, class_) transition(state, class_)
The parser makes 2N transitions for a sentence of length N. In order to select The parser makes 2N transitions for a sentence of length N. In order to select
the transition, it extracts a vector of K features from the state. Each feature the transition, it extracts a vector of K features from the state. Each feature
is used as a key into a hash table managed by the model. The features map to is used as a key into a hash table managed by the model. The features map to
a vector of weights, of length C. We then dot product the feature weights to the a vector of weights, of length C. We then dot product the feature weights to the
scores vector we are building for that instance. scores vector we are building for that instance.
@ -253,12 +253,10 @@ the classes. In the case of the parser, this means the hash table is accessed
2NKC times, instead of the 2NK times if you have a weights vector. You should 2NKC times, instead of the 2NK times if you have a weights vector. You should
also be careful to store the weights contiguously in memory --- you don't want also be careful to store the weights contiguously in memory --- you don't want
a linked list here. I use a block-sparse format, because my problems tend to a linked list here. I use a block-sparse format, because my problems tend to
have a few dozen classes. have a few dozen classes.
I guess if I had to summarize my experience, I'd say that the efficiency of I guess if I had to summarize my experience, I'd say that the efficiency of
these models is really all about the data structures. We want to stay small, these models is really all about the data structures. We want to stay small,
and stay contiguous. Minimize redundancy and minimize pointer chasing. and stay contiguous. Minimize redundancy and minimize pointer chasing.
That's why Cython is so well suited to this: we get to lay out our data That's why Cython is so well suited to this: we get to lay out our data
structures, and manage the memory ourselves, with full C-level control. structures, and manage the memory ourselves, with full C-level control.

View File

@ -10,7 +10,7 @@ spaCy: Industrial-strength NLP
.. _Issue Tracker: https://github.com/honnibal/spaCy/issues .. _Issue Tracker: https://github.com/honnibal/spaCy/issues
**13/04**: *Version 0.80 released. Includes named entity recognition, better sentence **13/04**: *Version 0.80 released. Includes named entity recognition, better sentence
boundary detection, and many bug fixes.* boundary detection, and many bug fixes.*
`spaCy`_ is a new library for text processing in Python and Cython. `spaCy`_ is a new library for text processing in Python and Cython.
@ -28,7 +28,7 @@ If they don't want to stay in academia, they join Google, IBM, etc.
The net result is that outside of the tech giants, commercial NLP has changed The net result is that outside of the tech giants, commercial NLP has changed
little in the last ten years. In academia, it's changed entirely. Amazing little in the last ten years. In academia, it's changed entirely. Amazing
improvements in quality. Orders of magnitude faster. But the improvements in quality. Orders of magnitude faster. But the
academic code is always GPL, undocumented, unusable, or all three. You could academic code is always GPL, undocumented, unusable, or all three. You could
implement the ideas yourself, but the papers are hard to read, and training implement the ideas yourself, but the papers are hard to read, and training
data is exorbitantly expensive. So what are you left with? A common answer is data is exorbitantly expensive. So what are you left with? A common answer is
@ -37,7 +37,7 @@ tokenizer is suitable for production use.
I used to think that the NLP community just needed to do more to communicate I used to think that the NLP community just needed to do more to communicate
its findings to software engineers. So I wrote two blog posts, explaining its findings to software engineers. So I wrote two blog posts, explaining
`how to write a part-of-speech tagger`_ and `parser`_. Both were very well received, `how to write a part-of-speech tagger`_ and `parser`_. Both were well received,
and there's been a bit of interest in `my research software`_ --- even though and there's been a bit of interest in `my research software`_ --- even though
it's entirely undocumented, and mostly unusable to anyone but me. it's entirely undocumented, and mostly unusable to anyone but me.
@ -58,14 +58,14 @@ to embedded word representations, and a range of useful features are pre-calcula
and cached. and cached.
If none of that made any sense to you, here's the gist of it. Computers don't If none of that made any sense to you, here's the gist of it. Computers don't
understand text. This is unfortunate, because that's what the web almost entirely understand text. This is unfortunate, because that's what the web almost entirely
consists of. We want to recommend people text based on other text they liked. consists of. We want to recommend people text based on other text they liked.
We want to shorten text to display it on a mobile screen. We want to aggregate We want to shorten text to display it on a mobile screen. We want to aggregate
it, link it, filter it, categorise it, generate it and correct it. it, link it, filter it, categorise it, generate it and correct it.
spaCy provides a library of utility functions that help programmers build such spaCy provides a library of utility functions that help programmers build such
products. It's commercial open source software: you can either use it under products. It's commercial open source software: you can either use it under
the AGPL, or you can `buy a commercial license`_ for a one-time fee. the AGPL, or you can `buy a commercial license`_ for a one-time fee.
.. _buy a commercial license: license.html .. _buy a commercial license: license.html
@ -148,7 +148,7 @@ cosine metric:
>>> from numpy import dot >>> from numpy import dot
>>> from numpy.linalg import norm >>> from numpy.linalg import norm
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
>>> words = [w for w in nlp.vocab if w.has_repvec] >>> words = [w for w in nlp.vocab if w.has_repvec]
>>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
@ -200,9 +200,9 @@ this:
We wanted to refine the logic so that only adverbs modifying evocative verbs We wanted to refine the logic so that only adverbs modifying evocative verbs
of communication, like "pleaded", were highlighted. We've now built a vector that of communication, like "pleaded", were highlighted. We've now built a vector that
represents that type of word, so now we can highlight adverbs based on very represents that type of word, so now we can highlight adverbs based on
subtle logic, honing in on adverbs that seem the most stylistically subtle logic, honing in on adverbs that seem the most stylistically
problematic, given our starting assumptions: problematic, given our starting assumptions:
@ -213,7 +213,7 @@ problematic, given our starting assumptions:
>>> from spacy.parts_of_speech import ADV, VERB >>> from spacy.parts_of_speech import ADV, VERB
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
>>> def is_bad_adverb(token, target_verb, tol): >>> def is_bad_adverb(token, target_verb, tol):
... if token.pos != ADV ... if token.pos != ADV
... return False ... return False
... elif token.head.pos != VERB: ... elif token.head.pos != VERB:
... return False ... return False
@ -238,11 +238,11 @@ database, and processed with an NLP library, to one of three levels of detail
--- tokenization, tagging, or parsing. The tasks are additive: to parse the --- tokenization, tagging, or parsing. The tasks are additive: to parse the
text you have to tokenize and tag it. The pre-processing was not subtracted text you have to tokenize and tag it. The pre-processing was not subtracted
from the times --- I report the time required for the pipeline to complete. from the times --- I report the time required for the pipeline to complete.
I report mean times per document, in milliseconds. I report mean times per document, in milliseconds.
**Hardware**: Intel i7-3770 (2012) **Hardware**: Intel i7-3770 (2012)
.. table:: Efficiency comparison. Lower is better. .. table:: Efficiency comparison. Lower is better.
+--------------+---------------------------+--------------------------------+ +--------------+---------------------------+--------------------------------+
| | Absolute (ms per doc) | Relative (to spaCy) | | | Absolute (ms per doc) | Relative (to spaCy) |
@ -278,7 +278,7 @@ representations.
publish or perform any benchmark or performance tests or analysis relating to publish or perform any benchmark or performance tests or analysis relating to
the Service or the use thereof without express authorization from AlchemyAPI; the Service or the use thereof without express authorization from AlchemyAPI;
.. Did you get that? You're not allowed to evaluate how well their system works, .. Did you get that? You're not allowed to evaluate how well their system works,
unless you're granted a special exception. Their system must be pretty unless you're granted a special exception. Their system must be pretty
terrible to motivate such an embarrassing restriction. terrible to motivate such an embarrassing restriction.
They must know this makes them look bad, but they apparently believe allowing They must know this makes them look bad, but they apparently believe allowing
@ -287,7 +287,7 @@ representations.
.. spaCy is based on science, not alchemy. It's open source, and I am happy to .. spaCy is based on science, not alchemy. It's open source, and I am happy to
clarify any detail of the algorithms I've implemented. clarify any detail of the algorithms I've implemented.
It's evaluated against the current best published systems, following the standard It's evaluated against the current best published systems, following the standard
methodologies. These evaluations show that it performs extremely well. methodologies. These evaluations show that it performs extremely well.
Accuracy Comparison Accuracy Comparison
------------------- -------------------
@ -299,7 +299,7 @@ Accuracy Comparison
+--------------+----------+------------+ +--------------+----------+------------+
| spaCy | 97.2 | 92.4 | | spaCy | 97.2 | 92.4 |
+--------------+----------+------------+ +--------------+----------+------------+
| CoreNLP | 96.9 | 92.2 | | CoreNLP | 96.9 | 92.2 |
+--------------+----------+------------+ +--------------+----------+------------+
| ZPar | 97.3 | 92.9 | | ZPar | 97.3 | 92.9 |
+--------------+----------+------------+ +--------------+----------+------------+
@ -329,5 +329,5 @@ previous fastest parser that I'm aware of.
quickstart.rst quickstart.rst
api.rst api.rst
howworks.rst howworks.rst
license.rst license.rst
updates.rst updates.rst

View File

@ -97,7 +97,7 @@ like lead-text take a while to float up the priority list. This strategy also h
the advantage of transparency: it's obvious to users how the decision is being the advantage of transparency: it's obvious to users how the decision is being
made, so nobody is likely to complain about the feature if it works this way. made, so nobody is likely to complain about the feature if it works this way.
Instead of cutting off the text mid-word, we can tokenize the text, and Instead of cutting off the text mid-word, we can tokenize the text, and
+----------------+-----------+ +----------------+-----------+
| System | Rouge-1 R | | System | Rouge-1 R |
@ -116,7 +116,7 @@ A simple bag-of-words model can be created using the `count_by` method, which
produces a dictionary of frequencies, keyed by string IDs: produces a dictionary of frequencies, keyed by string IDs:
.. code:: python .. code:: python
>>> from spacy.en import English >>> from spacy.en import English
>>> from spacy.en.attrs import SIC >>> from spacy.en.attrs import SIC
>>> nlp = English() >>> nlp = English()
@ -148,7 +148,7 @@ from any token:
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ .. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
@ -196,8 +196,8 @@ undirected --- so, it's natural to represent this as a matrix:
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
import numpy import numpy
def lexrank(sent_vectors): def lexrank(sent_vectors):
n = len(sent_vectors) n = len(sent_vectors)
# Build the cosine similarity matrix # Build the cosine similarity matrix
@ -205,7 +205,7 @@ undirected --- so, it's natural to represent this as a matrix:
for i in range(n): for i in range(n):
for j in range(n): for j in range(n):
matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j]) matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j])
# Normalize # Normalize
for i in range(n): for i in range(n):
matrix[i] /= sum(matrix[i]) matrix[i] /= sum(matrix[i])
return _pagerank(matrix) return _pagerank(matrix)
@ -278,6 +278,3 @@ sentence represents the document as a whole.
Document Model Document Model
-------------- --------------

View File

@ -13,7 +13,7 @@ I've been writing spaCy for six months now, and I'm very excited to release it.
I think it's the most valuable thing I could have built. When I was in I think it's the most valuable thing I could have built. When I was in
academia, I noticed that small companies couldn't really make use of our work. academia, I noticed that small companies couldn't really make use of our work.
Meanwhile the tech giants have been hiring *everyone*, and putting this stuff Meanwhile the tech giants have been hiring *everyone*, and putting this stuff
into production. I think spaCy can change that. into production. I think spaCy can change that.
+------------+-----------+----------+-------------------------------------+ +------------+-----------+----------+-------------------------------------+
@ -35,7 +35,7 @@ And if you're ever in acquisition or IPO talks, the story is simple.
spaCy can also be used as free open-source software, under the Affero GPL spaCy can also be used as free open-source software, under the Affero GPL
license. If you use it this way, you must comply with the AGPL license terms. license. If you use it this way, you must comply with the AGPL license terms.
When you distribute your project, or offer it as a network service, you must When you distribute your project, or offer it as a network service, you must
distribute the source-code, and grant users an AGPL license to it. distribute the source-code and grant users an AGPL license to it.
.. I left academia in June 2014, just when I should have been submitting my first .. I left academia in June 2014, just when I should have been submitting my first
@ -52,14 +52,14 @@ Examples
-------- --------
In order to clarify how spaCy's license structure might apply to you, I've In order to clarify how spaCy's license structure might apply to you, I've
written a few examples, in the form of user-stories. written a few examples, in the form of user-stories.
Ashley and Casey: Seed stage start-up Ashley and Casey: Seed stage start-up
##################################### #####################################
Ashley and Casey have an idea for a start-up. To explore their idea, they want Ashley and Casey have an idea for a start-up. To explore their idea, they want
to build a minimum viable product they can put in front of potential users and to build a minimum viable product they can put in front of potential users and
investors. investors.
They have two options. They have two options.
@ -75,7 +75,7 @@ They have two options.
import a module that imports it, etc). They also cannot use spaCy as import a module that imports it, etc). They also cannot use spaCy as
a network resource, by running it as a service --- this is the a network resource, by running it as a service --- this is the
loophole that the "A" part of the AGPL is designed to close. loophole that the "A" part of the AGPL is designed to close.
Ashley and Casey find the AGPL license unattractive for commercial use. Ashley and Casey find the AGPL license unattractive for commercial use.
They decide to take up the trial commercial license. They decide to take up the trial commercial license.
However, over the next 90 days, Ashley has to move house twice, and Casey gets However, over the next 90 days, Ashley has to move house twice, and Casey gets
@ -92,7 +92,7 @@ developing. They own the copyright to any modifications they make to spaCy,
but not to the original spaCy code. but not to the original spaCy code.
No additional fees will be due when they hire new developers, run spaCy on No additional fees will be due when they hire new developers, run spaCy on
additional internal servers, etc. If their company is acquired, the license will additional internal servers, etc. If their company is acquired, the license will
be transferred to the company acquiring them. However, to use spaCy in another be transferred to the company acquiring them. However, to use spaCy in another
product, they will have to buy a second license. product, they will have to buy a second license.
@ -115,9 +115,9 @@ In order to do this, they must sign a contributor agreement, ceding their
copyright. When commercial licenses to spaCy are sold, Alex and Sasha will copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
not be able to claim any royalties from their contributions. not be able to claim any royalties from their contributions.
Later, Alex and Sasha implement new features into spaCy, for another paper. The Later, Alex and Sasha implement new features into spaCy, for another paper. The
code was quite rushed, and they don't want to take the time to put together a code was quite rushed, and they don't want to take the time to put together a
proper pull request. They must release their modifications under the AGPL, but proper pull request. They must release their modifications under the AGPL, but
they are not obliged to contribute it to the spaCy repository, or concede their they are not obliged to contribute it to the spaCy repository, or concede their
copyright. copyright.
@ -126,8 +126,8 @@ Phuong and Jessie: Open Source developers
######################################### #########################################
Phuong and Jessie use the open-source software Calibre to manage their e-book Phuong and Jessie use the open-source software Calibre to manage their e-book
libraries. They have an idea for a search feature, and they want to use spaCy libraries. They have an idea for a search feature, and they want to use spaCy
to implement it. Calibre is released under the GPLv3. The AGPL has additional to implement it. Calibre is released under the GPLv3. The AGPL has additional
restrictions for projects used as a network resource, but they don't apply to restrictions for projects used as a network resource, but they don't apply to
this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll
have to release their code, but that was always their intention anyway. have to release their code, but that was always their intention anyway.

View File

@ -18,12 +18,12 @@ With Python 2.7 or Python 3, using Linux or OSX, run:
.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz .. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz
The download command fetches and installs about 300mb of data, for the The download command fetches and installs about 300mb of data, for the
parser model and word vectors, which it installs within the spacy.en package directory. parser model and word vectors, which it installs within the spacy.en package directory.
If you're stuck using a server with an old version of Python, and you don't If you're stuck using a server with an old version of Python, and you don't
have root access, I've prepared a bootstrap script to help you compile a local have root access, I've prepared a bootstrap script to help you compile a local
Python install. Run: Python install. Run:
.. code:: bash .. code:: bash
@ -47,7 +47,7 @@ this is how I build the project.
$ py.test tests/ $ py.test tests/
Python packaging is awkward at the best of times, and it's particularly tricky Python packaging is awkward at the best of times, and it's particularly tricky
with C extensions, built via Cython, requiring large data files. So, please with C extensions, built via Cython, requiring large data files. So, please
report issues as you encounter them, and bear with me :) report issues as you encounter them, and bear with me :)
Usage Usage
@ -88,7 +88,7 @@ the original orthographic form of the word.
.. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data')) .. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data'))
.. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens
+-----------------+--------------+--------------+ +-----------------+--------------+--------------+
| Attribute | Type | Its API | | Attribute | Type | Its API |
@ -121,7 +121,7 @@ the original orthographic form of the word.
**Get sentence or named entity spans** **Get sentence or named entity spans**
.. py:attribute:: tokens.Tokens.sents --> Iterator[Span] .. py:attribute:: tokens.Tokens.sents --> Iterator[Span]
.. py:attribute:: tokens.Tokens.ents --> Iterator[Span] .. py:attribute:: tokens.Tokens.ents --> Iterator[Span]
You can iterate over a Span to access individual Tokens, or access its You can iterate over a Span to access individual Tokens, or access its
@ -131,7 +131,7 @@ the original orthographic form of the word.
**Embedded word representations** **Embedded word representations**
.. py:attribute:: tokens.Token.repvec .. py:attribute:: tokens.Token.repvec
.. py:attribute:: lexeme.Lexeme.repvec .. py:attribute:: lexeme.Lexeme.repvec
@ -150,13 +150,13 @@ the original orthographic form of the word.
**Align to original string** **Align to original string**
.. py:attribute:: string: unicode .. py:attribute:: string: unicode
Padded with original whitespace. Padded with original whitespace.
.. py:attribute:: length: int .. py:attribute:: length: int
Length, in unicode code-points. Equal to len(self.orth_). Length, in unicode code-points. Equal to len(self.orth_).
.. py:attribute:: idx: int .. py:attribute:: idx: int
Starting offset of word in the original string. Starting offset of word in the original string.
@ -234,4 +234,3 @@ Features
+---------+-----------------------------------------------------------+ +---------+-----------------------------------------------------------+
| prob | Log probability of word, smoothed with Simple Good-Turing | | prob | Log probability of word, smoothed with Simple Good-Turing |
+---------+-----------------------------------------------------------+ +---------+-----------------------------------------------------------+

View File

@ -7,8 +7,8 @@ Updates
Five days ago I presented the alpha release of spaCy, a natural language Five days ago I presented the alpha release of spaCy, a natural language
processing library that brings state-of-the-art technology to small companies. processing library that brings state-of-the-art technology to small companies.
spaCy has been very well received, and there are now a lot of eyes on the project. spaCy has been well received, and there are now a lot of eyes on the project.
Naturally, lots of issues have surfaced. I'm very grateful to those who've reported Naturally, lots of issues have surfaced. I'm grateful to those who've reported
them. I've worked hard to address them as quickly as I could. them. I've worked hard to address them as quickly as I could.
Bug Fixes Bug Fixes
@ -21,22 +21,22 @@ Bug Fixes
all look-ups into the vocabulary failed on wide unicode builds, which all look-ups into the vocabulary failed on wide unicode builds, which
further meant that the part-of-speech tagger and parser features were not further meant that the part-of-speech tagger and parser features were not
computed correctly. computed correctly.
The fix is simple: we already have to read in a list of all the strings, so The fix is simple: we already have to read in a list of all the strings, so
just store an index into that list, instead of a hash. just store an index into that list, instead of a hash.
* Parse tree navigation API was rough, and buggy. * Parse tree navigation API was rough, and buggy.
The parse-tree navigation API was the last thing I added before v0.3. I've The parse-tree navigation API was the last thing I added before v0.3. I've
now replaced it with something better. The previous API design was flawed, now replaced it with something better. The previous API design was flawed,
and the implementation was buggy --- Token.child() and Token.head were and the implementation was buggy --- Token.child() and Token.head were
sometimes inconsistent. sometimes inconsistent.
I've addressed the most immediate problems, but this part of the design is I've addressed the most immediate problems, but this part of the design is
still a work in progress. It's a difficult problem. The parse is a tree, still a work in progress. It's a difficult problem. The parse is a tree,
and we want to freely navigate up and down it without creating reference and we want to freely navigate up and down it without creating reference
cycles that inhibit garbage collection, and without doing a lot of copying, cycles that inhibit garbage collection, and without doing a lot of copying,
creating and deleting. creating and deleting.
I think I've got a promising solution to this, but I suspect there's I think I've got a promising solution to this, but I suspect there's
currently a memory leak. Please get in touch on the tracker if you want to currently a memory leak. Please get in touch on the tracker if you want to
know more, especially if you think you can help. know more, especially if you think you can help.
@ -53,14 +53,14 @@ pinning down or reproducing. Please send details of your system to the
Enhancements: Train and evaluate on whole paragraphs Enhancements: Train and evaluate on whole paragraphs
---------------------------------------------------- ----------------------------------------------------
.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser. .. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser.
Most English parsing research is performed on text with perfect pre-processing: Most English parsing research is performed on text with perfect pre-processing:
one newline between every sentence, one space between every token. one newline between every sentence, one space between every token.
It's always been done this way, and it's good. It's a useful idealisation, It's always been done this way, and it's good. It's a useful idealisation,
because the pre-processing has few algorithmic implications. because the pre-processing has few algorithmic implications.
But, for practical performance, this stuff can matter a lot. But, for practical performance, this stuff can matter a lot.
Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few
parsers on raw text. Even on the standard Wall Street Journal corpus, parsers on raw text. Even on the standard Wall Street Journal corpus,
@ -77,7 +77,7 @@ made a big difference:
| Corrected | 89.9 | 88.8 | | Corrected | 89.9 | 88.8 |
+-------------+-------+----------+ +-------------+-------+----------+
.. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable. .. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable.
@ -108,9 +108,9 @@ input to be segmented into sentences, but with no sentence segmenter. This
caused a drop in parse accuracy of 4%! caused a drop in parse accuracy of 4%!
Over the last five days, I've worked hard to correct this. I implemented the Over the last five days, I've worked hard to correct this. I implemented the
modifications to the parsing algorithm I had planned, from Dongdong Zhang et al modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
(2013), and trained and evaluated the parser on raw text, using the version of (2013), and trained and evaluated the parser on raw text, using the version of
the WSJ distributed by Read et al (2012), and used in Dridan and Oepen's the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
experiments. experiments.
I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly

2
fabfile.py vendored
View File

@ -1,4 +1,4 @@
from fabric.api import local, run, lcd, cd, env from fabric.api import local, lcd, env
from os.path import exists as file_exists from os.path import exists as file_exists
from fabtools.python import virtualenv from fabtools.python import virtualenv
from os import path from os import path

View File

@ -1,7 +1,7 @@
{ {
"PRP": { "PRP": {
"I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1}, "I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1},
"me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3}, "me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
"mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2}, "mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
"myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4}, "myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4},
"you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0}, "you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0},

View File

@ -1,16 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
import subprocess
from setuptools import setup from setuptools import setup
from glob import glob
import shutil import shutil
import sys import sys
import os import os
from os import path from os import path
from os.path import splitext
import shutil
from setuptools import Extension from setuptools import Extension
from distutils import sysconfig from distutils import sysconfig
import platform import platform
@ -155,7 +150,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.morphology', 'spacy.morphology',
'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state',
'spacy.syntax.transition_system', 'spacy.syntax.transition_system',
'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features',
'spacy.syntax.conll', 'spacy.orth', 'spacy.syntax.conll', 'spacy.orth',
'spacy.syntax.ner'] 'spacy.syntax.ner']

View File

@ -33,7 +33,7 @@ cdef class Model:
cdef class HastyModel: cdef class HastyModel:
cdef Pool mem cdef Pool mem
cdef weight_t* _scores cdef weight_t* _scores
cdef const weight_t* score(self, atom_t* context) except NULL cdef const weight_t* score(self, atom_t* context) except NULL
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1

View File

@ -79,5 +79,3 @@ cpdef enum attr_id_t:
POS POS
TAG TAG
DEP DEP

View File

@ -129,19 +129,19 @@ class English(object):
entity=parse_if_model_present, merge_mwes=False): entity=parse_if_model_present, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences, """Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string and can contain arbtrary whitespace. Alignment into the original string
The tagger and parser are lazy-loaded the first time they are required. The tagger and parser are lazy-loaded the first time they are required.
Loading the parser model usually takes 5-10 seconds. Loading the parser model usually takes 5-10 seconds.
Args: Args:
text (unicode): The text to be processed. text (unicode): The text to be processed.
Keyword args: Keyword args:
tag (bool): Whether to add part-of-speech tags to the text. Also tag (bool): Whether to add part-of-speech tags to the text. Also
sets morphological analysis and lemmas. sets morphological analysis and lemmas.
parse (True, False, -1): Whether to add labelled syntactic dependencies. parse (True, False, -1): Whether to add labelled syntactic dependencies.
-1 (default) is "guess": It will guess True if tag=True and the -1 (default) is "guess": It will guess True if tag=True and the
model has been installed. model has been installed.

View File

@ -39,7 +39,7 @@ def install_parser_model(url, dest_dir):
def install_dep_vectors(url, dest_dir): def install_dep_vectors(url, dest_dir):
if not os.path.exists(dest_dir): if not os.path.exists(dest_dir):
os.mkdir(dest_dir) os.mkdir(dest_dir)
filename = download_file(url, dest_dir) filename = download_file(url, dest_dir)

View File

@ -22,4 +22,3 @@ cdef class EnPosTagger:
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

View File

@ -353,7 +353,7 @@ cdef class EnPosTagger:
cached.lemma = self.strings[lemma_str] cached.lemma = self.strings[lemma_str]
set_morph_from_dict(&cached.morph, props) set_morph_from_dict(&cached.morph, props)
self._morph_cache.set(pos, orth, <void*>cached) self._morph_cache.set(pos, orth, <void*>cached)
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
_fill_from_token(&context[P2_orth], &tokens[i-2]) _fill_from_token(&context[P2_orth], &tokens[i-2])
@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[7] = 4 context[7] = 4
else: else:
context[7] = 0 context[7] = 0

View File

@ -12,7 +12,7 @@ cdef LexemeC EMPTY_LEXEME
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
const float* empty_vec) except -1 const float* empty_vec) except -1
cdef class Lexeme: cdef class Lexeme:
cdef readonly ndarray repvec cdef readonly ndarray repvec

View File

@ -17,12 +17,12 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
const float* empty_vec) except -1: const float* empty_vec) except -1:
lex.length = props['length'] lex.length = props['length']
lex.orth = string_store[props['orth']] lex.orth = string_store[props['orth']]
lex.lower = string_store[props['lower']] lex.lower = string_store[props['lower']]
lex.norm = string_store[props['norm']] lex.norm = string_store[props['norm']]
lex.shape = string_store[props['shape']] lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']] lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']] lex.suffix = string_store[props['suffix']]
lex.cluster = props['cluster'] lex.cluster = props['cluster']
lex.prob = props['prob'] lex.prob = props['prob']
lex.sentiment = props['sentiment'] lex.sentiment = props['sentiment']

View File

@ -58,10 +58,10 @@ LOCAL = (
(N3.sic,), (N3.sic,),
(P4.sic,), (P4.sic,),
(N4.sic,), (N4.sic,),
(P1.sic, N0.sic,), (P1.sic, N0.sic,),
(N0.sic, N1.sic), (N0.sic, N1.sic),
(N0.prefix,), (N0.prefix,),
(N0.suffix,), (N0.suffix,),

View File

@ -11,7 +11,7 @@ cdef class NERAnnotation:
memset(self.starts, -1, sizeof(int) * length) memset(self.starts, -1, sizeof(int) * length)
memset(self.ends, -1, sizeof(int) * length) memset(self.ends, -1, sizeof(int) * length)
memset(self.labels, -1, sizeof(int) * length) memset(self.labels, -1, sizeof(int) * length)
cdef int start, end, label cdef int start, end, label
for start, end, label in entities: for start, end, label in entities:
for i in range(start, end): for i in range(start, end):

View File

@ -107,7 +107,7 @@ cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
# U, Gold L --> False # U, Gold L --> False
# U, Gold O --> False # U, Gold O --> False
return False return False
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0: cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
cdef int n_accept = 0 cdef int n_accept = 0
@ -160,7 +160,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
cdef int best = first_accept cdef int best = first_accept
cdef weight_t score = scores[first_accept-1] cdef weight_t score = scores[first_accept-1]
cdef int i cdef int i
for i in range(first_accept+1, n): for i in range(first_accept+1, n):
if moves[i].accept and scores[i-1] > score: if moves[i].accept and scores[i-1] > score:
best = i best = i
score = scores[i-1] score = scores[i-1]
@ -179,7 +179,7 @@ cdef int transition(State *s, Move* move) except -1:
end_entity(s) end_entity(s)
elif move.action == OUT: elif move.action == OUT:
pass pass
s.tags[s.i] = move.clas s.tags[s.i] = move.clas
s.i += 1 s.i += 1

View File

@ -149,5 +149,3 @@ cpdef enum:
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1

View File

@ -18,7 +18,7 @@ cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
c[T_postype] = lex.postype c[T_postype] = lex.postype
c[T_nertype] = 0 c[T_nertype] = 0
c[T_sensetype] = 0 c[T_sensetype] = 0
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA) c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
c[T_is_digit] = lex.flags & (1 << IS_DIGIT) c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
c[T_is_lower] = lex.flags & (1 << IS_LOWER) c[T_is_lower] = lex.flags & (1 << IS_LOWER)

View File

@ -7,10 +7,10 @@ LOCAL = (
(N1_sic,), (N1_sic,),
(P2_sic,), (P2_sic,),
(N2_sic,), (N2_sic,),
(P1_sic, W_sic,), (P1_sic, W_sic,),
(W_sic, N1_sic), (W_sic, N1_sic),
(W_prefix,), (W_prefix,),
(W_suffix,), (W_suffix,),

View File

@ -92,7 +92,7 @@ cdef class NERParser:
fill_context(self._context, s, tokens) fill_context(self._context, s, tokens)
self.extractor.extract(self._feats, self._values, self._context, NULL) self.extractor.extract(self._feats, self._values, self._context, NULL)
self.model.score(self._scores, self._feats, self._values) self.model.score(self._scores, self._feats, self._values)
set_accept_if_valid(self._moves, self.n_classes, s) set_accept_if_valid(self._moves, self.n_classes, s)
guess = best_accepted(self._moves, self._scores, self.n_classes) guess = best_accepted(self._moves, self._scores, self.n_classes)
assert guess.clas != 0 assert guess.clas != 0

View File

@ -16,7 +16,7 @@ cpdef enum ActionType:
cdef int set_accept_if_oracle(Move* moves, int n, State* s, cdef int set_accept_if_oracle(Move* moves, int n, State* s,
int* g_starts, int* g_ends, int* g_labels) except 0 int* g_starts, int* g_ends, int* g_labels) except 0
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL

View File

@ -97,7 +97,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
cdef int best = first_accept cdef int best = first_accept
cdef weight_t score = scores[first_accept-1] cdef weight_t score = scores[first_accept-1]
cdef int i cdef int i
for i in range(first_accept+1, n): for i in range(first_accept+1, n):
if moves[i].accept and scores[i-1] > score: if moves[i].accept and scores[i-1] > score:
best = i best = i
score = scores[i-1] score = scores[i-1]
@ -105,7 +105,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
cdef int transition(State *s, Move* move) except -1: cdef int transition(State *s, Move* move) except -1:
s.tags[s.i] = move.clas s.tags[s.i] = move.clas
if move.action == OUT: if move.action == OUT:
s.i += 1 s.i += 1
elif move.action == SHIFT: elif move.action == SHIFT:

View File

@ -8,7 +8,7 @@ cdef class PyState:
cdef readonly list tag_names cdef readonly list tag_names
cdef readonly int n_classes cdef readonly int n_classes
cdef readonly dict moves_by_name cdef readonly dict moves_by_name
cdef Move* _moves cdef Move* _moves
cdef Move* _golds cdef Move* _golds
cdef State* _s cdef State* _s

View File

@ -33,7 +33,7 @@ class Scorer(object):
@property @property
def ents_r(self): def ents_r(self):
return (self.ents_tp / (self.ents_tp + self.ents_fn + 1e-100)) * 100 return (self.ents_tp / (self.ents_tp + self.ents_fn + 1e-100)) * 100
@property @property
def ents_f(self): def ents_f(self):
return (2 * self.ents_p * self.ents_r) / (self.ents_p + self.ents_r + 1e-100) return (2 * self.ents_p * self.ents_r) / (self.ents_p + self.ents_r + 1e-100)

View File

@ -5,7 +5,7 @@ from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab from .vocab cimport Vocab
from .strings cimport StringStore from .strings cimport StringStore
cdef class Span: cdef class Span:
cdef readonly Tokens _seq cdef readonly Tokens _seq
cdef public int i cdef public int i
@ -15,4 +15,3 @@ cdef class Span:
cdef public Span head cdef public Span head
cdef public list rights cdef public list rights
cdef public list lefts cdef public list lefts

View File

@ -8,7 +8,7 @@ cdef struct LexemeC:
const float* repvec const float* repvec
flags_t flags flags_t flags
attr_t id attr_t id
attr_t length attr_t length
@ -18,7 +18,7 @@ cdef struct LexemeC:
attr_t shape attr_t shape
attr_t prefix attr_t prefix
attr_t suffix attr_t suffix
attr_t cluster attr_t cluster
float prob float prob

View File

@ -99,7 +99,7 @@ cpdef enum:
S0_shape S0_shape
S0_ne_iob S0_ne_iob
S0_ne_type S0_ne_type
S0r2w S0r2w
S0r2W S0r2W
S0r2p S0r2p
@ -164,7 +164,7 @@ cpdef enum:
N0_shape N0_shape
N0_ne_iob N0_ne_iob
N0_ne_type N0_ne_type
N1w N1w
N1W N1W
N1p N1p
@ -190,7 +190,7 @@ cpdef enum:
N2_shape N2_shape
N2_ne_iob N2_ne_iob
N2_ne_type N2_ne_type
P1w P1w
P1W P1W
P1p P1p
@ -203,7 +203,7 @@ cpdef enum:
P1_shape P1_shape
P1_ne_iob P1_ne_iob
P1_ne_type P1_ne_type
P2w P2w
P2W P2W
P2p P2p
@ -216,7 +216,7 @@ cpdef enum:
P2_shape P2_shape
P2_ne_iob P2_ne_iob
P2_ne_type P2_ne_type
E0w E0w
E0W E0W
E0p E0p
@ -229,7 +229,7 @@ cpdef enum:
E0_shape E0_shape
E0_ne_iob E0_ne_iob
E0_ne_type E0_ne_type
E1w E1w
E1W E1W
E1p E1p
@ -242,7 +242,7 @@ cpdef enum:
E1_shape E1_shape
E1_ne_iob E1_ne_iob
E1_ne_type E1_ne_type
# Misc features at the end # Misc features at the end
dist dist
N0lv N0lv

View File

@ -111,10 +111,10 @@ ner = (
(N1W,), (N1W,),
(P2W,), (P2W,),
(N2W,), (N2W,),
(P1W, N0W,), (P1W, N0W,),
(N0W, N1W), (N0W, N1W),
(N0_prefix,), (N0_prefix,),
(N0_suffix,), (N0_suffix,),
@ -205,22 +205,22 @@ ner = (
unigrams = ( unigrams = (
(S2W, S2p), (S2W, S2p),
(S2c6, S2p), (S2c6, S2p),
(S1W, S1p), (S1W, S1p),
(S1c6, S1p), (S1c6, S1p),
(S0W, S0p), (S0W, S0p),
(S0c6, S0p), (S0c6, S0p),
(N0W, N0p), (N0W, N0p),
(N0p,), (N0p,),
(N0c,), (N0c,),
(N0c6, N0p), (N0c6, N0p),
(N0L,), (N0L,),
(N1W, N1p), (N1W, N1p),
(N1c6, N1p), (N1c6, N1p),
(N2W, N2p), (N2W, N2p),
(N2c6, N2p), (N2c6, N2p),

View File

@ -27,7 +27,7 @@ cdef int pop_stack(State *s) except -1:
s.stack -= 1 s.stack -= 1
if s.stack_len == 0 and not at_eol(s): if s.stack_len == 0 and not at_eol(s):
push_stack(s) push_stack(s)
cdef int push_stack(State *s) except -1: cdef int push_stack(State *s) except -1:
assert s.i < s.sent_len assert s.i < s.sent_len

View File

@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from ._state cimport State from ._state cimport State
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition

View File

@ -277,5 +277,3 @@ class OracleError(Exception):
class UnknownMove(Exception): class UnknownMove(Exception):
pass pass

View File

@ -36,7 +36,7 @@ from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE from ._parse_features cimport fill_context, CONTEXT_SIZE
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
global DEBUG global DEBUG
DEBUG = val DEBUG = val
@ -111,7 +111,7 @@ cdef class GreedyParser:
scores = self.model.score(context) scores = self.model.score(context)
guess = self.moves.best_valid(scores, state) guess = self.moves.best_valid(scores, state)
best = self.moves.best_gold(scores, state, gold) best = self.moves.best_gold(scores, state, gold)
cost = guess.get_cost(&guess, state, gold) cost = guess.get_cost(&guess, state, gold)
self.model.update(context, guess.clas, best.clas, cost) self.model.update(context, guess.clas, best.clas, cost)

View File

@ -34,16 +34,16 @@ cdef class TransitionSystem:
cdef int finalize_state(self, State* state) except -1 cdef int finalize_state(self, State* state) except -1
cdef int preprocess_gold(self, GoldParse gold) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except * cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition best_valid(self, const weight_t* scores, const State* state) except * cdef Transition best_valid(self, const weight_t* scores, const State* state) except *
cdef Transition best_gold(self, const weight_t* scores, const State* state, cdef Transition best_gold(self, const weight_t* scores, const State* state,
GoldParse gold) except * GoldParse gold) except *
#cdef class PyState: #cdef class PyState:
# """Provide a Python class for testing purposes.""" # """Provide a Python class for testing purposes."""

View File

@ -13,5 +13,3 @@ class Config(object):
@classmethod @classmethod
def read(cls, model_dir, name): def read(cls, model_dir, name):
return cls(**json.load(open(path.join(model_dir, '%s.json' % name)))) return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))

View File

@ -60,7 +60,7 @@ cdef class Tokenizer:
split off a suffix, and repeat. split off a suffix, and repeat.
Args: Args:
string (unicode): The string to be tokenized. string (unicode): The string to be tokenized.
Returns: Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs. tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
@ -213,7 +213,7 @@ cdef class Tokenizer:
cdef unicode string = chars[:length] cdef unicode string = chars[:length]
match = self._infix_re.search(string) match = self._infix_re.search(string)
return match.start() if match is not None else 0 return match.start() if match is not None else 0
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1: cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length] cdef unicode string = chars[:length]
match = self._prefix_re.search(string) match = self._prefix_re.search(string)

View File

@ -31,9 +31,9 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
cdef class Tokens: cdef class Tokens:
cdef Pool mem cdef Pool mem
cdef Vocab vocab cdef Vocab vocab
cdef TokenC* data cdef TokenC* data
cdef list _py_tokens cdef list _py_tokens
cdef unicode _string cdef unicode _string
@ -61,7 +61,7 @@ cdef class Token:
cdef int array_len cdef int array_len
cdef bint _owns_c_data cdef bint _owns_c_data
cdef Tokens _seq cdef Tokens _seq
@staticmethod @staticmethod

View File

@ -105,10 +105,10 @@ cdef class Tokens:
def __getitem__(self, object i): def __getitem__(self, object i):
"""Retrieve a token. """Retrieve a token.
The Python Token objects are created lazily from internal C data, and The Python Token objects are created lazily from internal C data, and
cached in _py_tokens cached in _py_tokens
Returns: Returns:
token (Token): token (Token):
""" """
@ -181,7 +181,7 @@ cdef class Tokens:
yield Span(self, start, i+1) yield Span(self, start, i+1)
start = None start = None
if start is not None: if start is not None:
yield Span(self, start, self.length) yield Span(self, start, self.length)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length: if self.length == self.max_length:
@ -299,7 +299,7 @@ cdef class Tokens:
# What to do about morphology?? # What to do about morphology??
# TODO: token.morph = ??? # TODO: token.morph = ???
token.tag = self.vocab.strings[tag] token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma] token.lemma = self.vocab.strings[lemma]
if ent_type == 'O': if ent_type == 'O':
token.ent_iob = 2 token.ent_iob = 2
token.ent_type = 0 token.ent_type = 0
@ -356,7 +356,7 @@ cdef class Tokens:
self._py_tokens = [None] * self.length self._py_tokens = [None] * self.length
# Return the merged Python object # Return the merged Python object
return self[start] return self[start]
cdef class Token: cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created """An individual token --- i.e. a word, a punctuation symbol, etc. Created
@ -630,4 +630,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
Check that the parser data is installed. Run "python -m spacy.en.download" if not. Check that the parser data is installed. Run "python -m spacy.en.download" if not.
Check whether parse=False in the call to English.__call__ Check whether parse=False in the call to English.__call__
""" """

View File

@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
ctypedef uint32_t id_t ctypedef uint32_t id_t
ctypedef uint16_t len_t ctypedef uint16_t len_t
ctypedef uint16_t tag_t ctypedef uint16_t tag_t

View File

@ -1,4 +1,3 @@
import os
from os import path from os import path
import codecs import codecs
import json import json
@ -72,7 +71,7 @@ def read_detoken_rules(lang):
for line in file_: for line in file_:
entries.append(line.strip()) entries.append(line.strip())
return entries return entries
def align_tokens(ref, indices): def align_tokens(ref, indices):
start = 0 start = 0
@ -88,7 +87,7 @@ def align_tokens(ref, indices):
def detokenize(token_rules, words): def detokenize(token_rules, words):
"""To align with treebanks, return a list of "chunks", where a chunk is a """To align with treebanks, return a list of "chunks", where a chunk is a
sequence of tokens that are separated by whitespace in actual strings. Each sequence of tokens that are separated by whitespace in actual strings. Each
chunk should be a tuple of token indices, e.g. chunk should be a tuple of token indices, e.g.

View File

@ -31,6 +31,5 @@ cdef class Vocab:
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map cdef PreshMap _map

View File

@ -170,7 +170,7 @@ cdef class Vocab:
self.lexemes[lexeme.id] = lexeme self.lexemes[lexeme.id] = lexeme
i += 1 i += 1
fclose(fp) fclose(fp)
def load_rep_vectors(self, loc): def load_rep_vectors(self, loc):
file_ = _CFile(loc, b'rb') file_ = _CFile(loc, b'rb')
cdef int32_t word_len cdef int32_t word_len
@ -187,7 +187,7 @@ cdef class Vocab:
except IOError: except IOError:
break break
file_.read(&vec_len, sizeof(vec_len), 1) file_.read(&vec_len, sizeof(vec_len), 1)
mem = Address(word_len, sizeof(char)) mem = Address(word_len, sizeof(char))
chars = <char*>mem.ptr chars = <char*>mem.ptr
vec = <float*>self.mem.alloc(vec_len, sizeof(float)) vec = <float*>self.mem.alloc(vec_len, sizeof(float))

View File

@ -7,6 +7,7 @@ from spacy.lexeme import lex_of
from spacy import LEX, NORM, SHAPE, LAST3 from spacy import LEX, NORM, SHAPE, LAST3
def test_group_by_lex(): def test_group_by_lex():
tokens = en.tokenize("I like the red one and I like the blue one") tokens = en.tokenize("I like the red one and I like the blue one")
names, hashes, groups = tokens.group_by(LEX) names, hashes, groups = tokens.group_by(LEX)

View File

@ -40,6 +40,7 @@ def test_begin(state, sentence):
assert not state.is_valid('O') assert not state.is_valid('O')
assert not state.is_valid('U-PER') assert not state.is_valid('U-PER')
def test_in(state, sentence): def test_in(state, sentence):
state.transition('B-PER') state.transition('B-PER')
assert state.n_ents == 0 assert state.n_ents == 0

View File

@ -1,4 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. 
[ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

View File

@ -30,6 +30,3 @@ def test_align_continue():
assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)]) assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
assert aligned[3] == ('and', [(13, 16)]) assert aligned[3] == ('and', [(13, 16)])
assert aligned[4] == ('continue', [(16, 24)]) assert aligned[4] == ('continue', [(16, 24)])

View File

@ -37,5 +37,3 @@ def test_dep():
assert feats_array[1][1] == tokens[1].dep assert feats_array[1][1] == tokens[1].dep
assert feats_array[2][1] == tokens[2].dep assert feats_array[2][1] == tokens[2].dep
assert feats_array[3][1] == tokens[3].dep assert feats_array[3][1] == tokens[3].dep

View File

@ -2,6 +2,7 @@
"""Sphinx doctest is just too hard. Manually paste doctest examples here""" """Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER from spacy.en.attrs import IS_LOWER
def test_1(): def test_1():
import spacy.en import spacy.en
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@ -39,6 +40,7 @@ def test2():
nlp.vocab[u'quietly'].prob nlp.vocab[u'quietly'].prob
-11.07155704498291 -11.07155704498291
def test3(): def test3():
import spacy.en import spacy.en
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@ -57,7 +59,7 @@ def test3():
assert sum(o) != 0 assert sum(o) != 0
from numpy import dot from numpy import dot
from numpy.linalg import norm from numpy.linalg import norm
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec] words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec]
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))

View File

@ -8,6 +8,7 @@ from spacy.en import English
def EN(): def EN():
return English() return English()
def test_tweebo_challenge(EN): def test_tweebo_challenge(EN):
text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
tokens = EN(text) tokens = EN(text)

View File

@ -16,6 +16,7 @@ def words():
return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!", return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!",
"!d", "\nd"] "!d", "\nd"]
def test_is_alpha(words): def test_is_alpha(words):
assert not is_alpha(words[0]) assert not is_alpha(words[0])
assert not is_alpha(words[1]) assert not is_alpha(words[1])

View File

@ -5,10 +5,12 @@ from spacy.strings import StringStore
import pytest import pytest
@pytest.fixture @pytest.fixture
def sstore(): def sstore():
return StringStore() return StringStore()
def test_save_bytes(sstore): def test_save_bytes(sstore):
Hello_i = sstore[b'Hello'] Hello_i = sstore[b'Hello']
assert Hello_i == 1 assert Hello_i == 1

View File

@ -2,10 +2,12 @@ import pytest
from spacy.en import English from spacy.en import English
@pytest.fixture @pytest.fixture
def EN(): def EN():
return English() return English()
def test_range_iter(EN): def test_range_iter(EN):
for i in range(len(EN.vocab)): for i in range(len(EN.vocab)):
lex = EN.vocab[i] lex = EN.vocab[i]

View File

@ -35,4 +35,3 @@ def test_merge_heads():
def test_issue_54(): def test_issue_54():
text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
tokens = NLU(text, merge_mwes=True) tokens = NLU(text, merge_mwes=True)

View File

@ -17,6 +17,7 @@ def morph_exc():
'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}}, 'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}},
} }
def test_load_exc(EN, morph_exc): def test_load_exc(EN, morph_exc):
EN.tagger.load_morph_exceptions(morph_exc) EN.tagger.load_morph_exceptions(morph_exc)
tokens = EN('I like his style.', tag=True) tokens = EN('I like his style.', tag=True)

View File

@ -3,6 +3,7 @@ from spacy.en import English
nlp = English() nlp = English()
def test_simple_types(): def test_simple_types():
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents) ents = list(tokens.ents)

View File

@ -33,4 +33,3 @@ def test_word():
def test_not_number(): def test_not_number():
assert not like_number('dog') assert not like_number('dog')
assert not like_number(',') assert not like_number(',')

View File

@ -3,6 +3,7 @@ import pytest
from spacy.en import English from spacy.en import English
def test_only_pre1(): def test_only_pre1():
EN = English() EN = English()
assert len(EN("(")) == 1 assert len(EN("(")) == 1

View File

@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
assert not children assert not children
for head_index, children in rights.items(): for head_index, children in rights.items():
assert not children assert not children

View File

@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):
def test_double_end_quote(EN): def test_double_end_quote(EN):
assert len(EN("Hello''")) == 2 assert len(EN("Hello''")) == 2
assert len(EN("''")) == 1 assert len(EN("''")) == 1

View File

@ -3,6 +3,7 @@ from spacy.en import English
import pytest import pytest
@pytest.fixture @pytest.fixture
def EN(): def EN():
return English() return English()

View File

@ -8,20 +8,26 @@ from spacy.orth import word_shape as ws
def test_capitalized(): def test_capitalized():
assert ws('Nasa') == 'Xxxx' assert ws('Nasa') == 'Xxxx'
def test_truncate(): def test_truncate():
assert ws('capitalized') == 'xxxx' assert ws('capitalized') == 'xxxx'
def test_digits(): def test_digits():
assert ws('999999999') == 'dddd' assert ws('999999999') == 'dddd'
def test_mix(): def test_mix():
assert ws('C3P0') == 'XdXd' assert ws('C3P0') == 'XdXd'
def test_punct(): def test_punct():
assert ws(',') == ',' assert ws(',') == ','
def test_space(): def test_space():
assert ws('\n') == '\n' assert ws('\n') == '\n'
def test_punct_seq(): def test_punct_seq():
assert ws('``,-') == '``,-' assert ws('``,-') == '``,-'

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
from spacy.en import English from spacy.en import English
import pytest import pytest
import re
EN = English() EN = English()

View File

@ -13,9 +13,11 @@ def EN():
def test_no_special(EN): def test_no_special(EN):
assert len(EN("(can)")) == 3 assert len(EN("(can)")) == 3
def test_no_punct(EN): def test_no_punct(EN):
assert len(EN("can't")) == 2 assert len(EN("can't")) == 2
def test_prefix(EN): def test_prefix(EN):
assert len(EN("(can't")) == 3 assert len(EN("(can't")) == 3

View File

@ -16,6 +16,3 @@ def test_one(EN):
assert tokens[0].orth_ == 'Betty' assert tokens[0].orth_ == 'Betty'
tokens2 = EN('Betty also bought a pound of butter.') tokens2 = EN('Betty also bought a pound of butter.')
assert tokens2[0].orth_ == 'Betty' assert tokens2[0].orth_ == 'Betty'

View File

@ -16,4 +16,3 @@ def test_subtrees():
assert len(list(bus.children)) == 1 assert len(list(bus.children)) == 1
assert len(list(wheels.subtree)) == 6 assert len(list(wheels.subtree)) == 6

View File

@ -1,6 +1,7 @@
from spacy.en import English from spacy.en import English
import six import six
def test_tag_names(): def test_tag_names():
nlp = English() nlp = English()
tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True) tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True)

View File

@ -6,6 +6,7 @@ import pytest
NLU = English() NLU = English()
def test_am_pm(): def test_am_pm():
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
variants = ['a.m.', 'am', 'p.m.', 'pm'] variants = ['a.m.', 'am', 'p.m.', 'pm']

View File

@ -4,6 +4,7 @@ import pytest
from spacy.en import English from spacy.en import English
from spacy.parts_of_speech import ADV from spacy.parts_of_speech import ADV
@pytest.fixture @pytest.fixture
def nlp(): def nlp():
return English() return English()

View File

@ -7,6 +7,8 @@ from spacy.en.attrs import IS_STOP
import pytest import pytest
nlp = English() nlp = English()
@pytest.fixture @pytest.fixture
def token(): def token():
tokens = nlp(u'Give it back! He pleaded.') tokens = nlp(u'Give it back! He pleaded.')
@ -35,5 +37,3 @@ def test_single_token_string():
nlp = English() nlp = English()
tokens = nlp(u'foobar') tokens = nlp(u'foobar')
assert tokens[0].string == 'foobar' assert tokens[0].string == 'foobar'

View File

@ -31,6 +31,7 @@ def _orphan_from_list(toks):
lst.append(tok) lst.append(tok)
return lst return lst
def test_list_orphans(): def test_list_orphans():
# Test case from NSchrading # Test case from NSchrading
nlp = English() nlp = English()

View File

@ -10,10 +10,12 @@ from spacy.en import English
def EN(): def EN():
return English().tokenizer return English().tokenizer
def test_no_word(EN): def test_no_word(EN):
tokens = EN(u'') tokens = EN(u'')
assert len(tokens) == 0 assert len(tokens) == 0
def test_single_word(EN): def test_single_word(EN):
tokens = EN(u'hello') tokens = EN(u'hello')
assert tokens[0].orth_ == 'hello' assert tokens[0].orth_ == 'hello'
@ -60,18 +62,19 @@ def test_contraction_punct(EN):
tokens = EN("can't!") tokens = EN("can't!")
assert len(tokens) == 3 assert len(tokens) == 3
def test_sample(EN): def test_sample(EN):
text = """Tributes pour in for late British Labour Party leader text = """Tributes pour in for late British Labour Party leader
Tributes poured in from around the world Thursday Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55. heart attack aged 55.
In Washington, the US State Department issued a statement regretting "the In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian. untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
"Mr. Smith, throughout his distinguished""" "Mr. Smith, throughout his distinguished"""
tokens = EN(text) tokens = EN(text)
assert len(tokens) > 5 assert len(tokens) > 5

View File

@ -3,6 +3,7 @@ from spacy.en import English
import pytest import pytest
@pytest.fixture @pytest.fixture
def tokens(): def tokens():
nlp = English() nlp = English()

View File

@ -2,6 +2,7 @@ from __future__ import unicode_literals
from spacy.orth import like_url from spacy.orth import like_url
def test_basic_url(): def test_basic_url():
assert like_url('www.google.com') assert like_url('www.google.com')
assert like_url('google.com') assert like_url('google.com')

View File

@ -4,15 +4,18 @@ from spacy.en import English
import pytest import pytest
@pytest.fixture @pytest.fixture
def EN(): def EN():
return English() return English()
def test_vec(EN): def test_vec(EN):
hype = EN.vocab['hype'] hype = EN.vocab['hype']
assert hype.orth_ == 'hype' assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07 assert 0.08 >= hype.repvec[0] > 0.07
def test_capitalized(EN): def test_capitalized(EN):
hype = EN.vocab['Hype'] hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype' assert hype.orth_ == 'Hype'

View File

@ -39,5 +39,3 @@ def test_newline_double_space(EN):
def test_newline_space_wrap(EN): def test_newline_space_wrap(EN):
tokens = EN('hello \n possums') tokens = EN('hello \n possums')
assert len(tokens) == 3 assert len(tokens) == 3

View File

@ -4,7 +4,6 @@ from spacy.en import English
from spacy.util import utf8open from spacy.util import utf8open
import pytest import pytest
import os
from os import path from os import path

View File

@ -20,7 +20,7 @@ s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g s=[,;:@#$%&]= & =g
# Assume sentence tokenization has been done first, so split FINAL periods # Assume sentence tokenization has been done first, so split FINAL periods
# only. # only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points, # however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem # since they shouldn't have the abbrev.-marker ambiguity problem