	Merge branch 'master' of ssh://github.com/honnibal/spaCy

commit fb8d50b3d5

.gitignore (vendored) | 1 +
@@ -17,6 +17,7 @@ models/
 spacy/syntax/*.cpp
 spacy/syntax/*.html
 spacy/en/*.cpp
+spacy/en/data/*
 spacy/*.cpp
 spacy/ner/*.cpp
 spacy/orthography/*.cpp

@@ -8,12 +8,12 @@ python:
  - "2.7"
  - "3.4"
 
-# command to install dependencies
+# install dependencies
 install:
  - "pip install --upgrade setuptools"
  - "pip install -r requirements.txt"
  - "export PYTHONPATH=`pwd`"
  - "python setup.py build_ext --inplace"
-# command to run tests
+# run tests
 script:
  - py.test tests/

@@ -3,20 +3,18 @@ spaCy
 
 http://honnibal.github.io/spaCy
 
-Fast, state-of-the-art natural language processing pipeline. Commercial licenses available, or use under AGPL.
+A pipeline for fast, state-of-the-art natural language processing. Commercial licenses available, otherwise under AGPL.
 
 Version 0.80 released
 ---------------------
 
 2015-04-13
 
-* Preliminary named entity recognition support. Accuracy is currently
-  substantially behind the current state-of-the-art. I'm working on
-  improvements.
+* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
 
 * Better sentence boundary detection, drawn from the syntactic structure.
 
-* Lots of bug fixes
+* Lots of bug fixes.
 
 
 Supports:

@@ -35,4 +33,3 @@ Difficult to support:
 
 * PyPy 2.7
 * PyPy 3.4
-

@@ -30,5 +30,3 @@ def main(text_loc):
 
 if __name__ == '__main__':
     plac.call(main)
-
-        

@@ -7,8 +7,6 @@ from os import path
 import shutil
 import codecs
 import random
-import time
-import gzip
 
 import plac
 import cProfile

@@ -1,8 +1,6 @@
 """Read a vector file, and prepare it as binary data, for easy consumption"""
 
-import bz2
 import plac
-import struct
 
 from spacy.vocab import write_binary_vectors
 

@@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in the
     $ git add -A spaCy/contributors/<your GitHub username>.md
     
 Now finish your pull request, and you're done.
-
-

contributors/suchow.md (new file) | 95 +
@@ -0,0 +1,95 @@
+Syllogism Contributor Agreement
+===============================
+
+This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
+Agreement. The SCA applies to any contribution that you make to any product or
+project managed by us (the “project”), and sets out the intellectual property
+rights you grant to us in the contributed materials. The term “us” shall mean
+Syllogism Co. The term "you" shall mean the person or entity identified below.
+If you agree to be bound by these terms, fill in the information requested below
+and include the filled-in version with your first pull-request, under the file
+contrbutors/. The name of the file should be your GitHub username, with the
+extension .md. For example, the user example_user would create the file
+spaCy/contributors/example_user.md .
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+1. The term 'contribution' or ‘contributed materials’ means any source code,
+object code, patch, tool, sample, graphic, specification, manual, documentation,
+or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and registrations,
+in your contribution:
+  * you hereby assign to us joint ownership, and to the extent that such assignment
+  is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
+  irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
+  to exercise all rights under those copyrights. This includes, at our option, the
+  right to sublicense these same rights to third parties through multiple levels of
+  sublicensees or other licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your contribution
+  as if each of us were the sole owners, and if one of us makes a derivative work
+  of your contribution, the one who makes the derivative work (or has it made) will
+  be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution against
+  us, our licensees or transferees;
+  
+  * you agree that we may register a copyright in your contribution and exercise
+  all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the consent
+  of, pay or render an accounting to the other for any use or distribution of your
+  contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
+worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer your
+  contribution in whole or in part, alone or in combination with
+  or included in any product, work or materials arising out of the project to
+  which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through multiple
+  levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective on
+the date you first submitted a contribution to us, even if your submission took
+place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of authorship
+  and you can legally grant the rights set out in this SCA;
+  
+  * to the best of your knowledge, each contribution will not violate any third
+  party's copyrights, trademarks, patents, or other intellectual property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and other
+  applicable export and import laws. You agree to notify us if you become aware of
+  any circumstance which would make any of the foregoing representations inaccurate
+  in any respect. Syllogism Co. may publicly disclose your participation in the project,
+  including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable U.S.
+  Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+x___ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions.
+
+____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jordan Suchow        |
+| Company's name (if applicable) |                      |
+| Title or Role (if applicable)  |                      |
+| Date                           | 2015-04-19           |
+| GitHub username                | suchow               |
+| Website (optional)             | http://suchow.io     |
+

@@ -64,8 +64,6 @@ def clean(ext):
             if os.path.exists(html):
                 os.unlink(html)
 
-
-
 HERE = os.path.dirname(__file__)
 virtual_env = os.environ.get('VIRTUAL_ENV', '')
 compile_args = []

@@ -75,4 +75,3 @@ Boolean features
 +-------------+--------------------------------------------------------------+
 | IN_LIST     | Facility for loading arbitrary run-time word lists?          |
 +-------------+--------------------------------------------------------------+
-

@@ -28,14 +28,14 @@ can access an excellent set of pre-computed orthographic and distributional features
     >>> are.check_flag(en.CAN_NOUN)
     False
 
-spaCy makes it easy to write very efficient NLP applications, because your feature
+spaCy makes it easy to write efficient NLP applications, because your feature
 functions have to do almost no work: almost every lexical property you'll want
 is pre-computed for you.  See the tutorial for an example POS tagger.
 
 Benchmark
 ---------
 
-The tokenizer itself is also very efficient:
+The tokenizer itself is also efficient:
 
 +--------+-------+--------------+--------------+
 | System | Time	 | Words/second | Speed Factor |

@@ -56,7 +56,7 @@ Pros:
 
 - All tokens come with indices into the original string
 - Full unicode support
-- Extensible to other languages
+- Extendable to other languages
 - Batch operations computed efficiently in Cython
 - Cython API
 - numpy interoperability

@@ -68,4 +68,3 @@ Cons:
 - Higher memory usage (up to 1gb)
 - More conceptually complicated
 - Tokenization rules expressed in code, not as data
-

@@ -135,7 +135,7 @@ lexical types.
 
 In a sample of text, vocabulary size grows exponentially slower than word
 count.  So any computations we can perform over the vocabulary and apply to the
-word count are very efficient.
+word count are efficient.
 
 
 Part-of-speech Tagger

@@ -260,5 +260,3 @@ these models is really all about the data structures.  We want to stay small,
 and stay contiguous.  Minimize redundancy and minimize pointer chasing.
 That's why Cython is so well suited to this: we get to lay out our data
 structures, and manage the memory ourselves, with full C-level control.
-
-

@@ -37,7 +37,7 @@ tokenizer is suitable for production use.
 
 I used to think that the NLP community just needed to do more to communicate
 its findings to software engineers.  So I wrote two blog posts, explaining
-`how to write a part-of-speech tagger`_ and `parser`_.  Both were very well received,
+`how to write a part-of-speech tagger`_ and `parser`_.  Both were well received,
 and there's been a bit of interest in `my research software`_ --- even though
 it's entirely undocumented, and mostly unuseable to anyone but me.
 

@@ -202,7 +202,7 @@ this:
 
 We wanted to refine the logic so that only adverbs modifying evocative verbs
 of communication, like "pleaded", were highlighted.  We've now built a vector that
-represents that type of word, so now we can highlight adverbs based on very
+represents that type of word, so now we can highlight adverbs based on
 subtle logic, honing in on adverbs that seem the most stylistically
 problematic, given our starting assumptions:
 

@@ -278,6 +278,3 @@ sentence represents the document as a whole.
 
 Document Model
 --------------
-
-
-

@@ -35,7 +35,7 @@ And if you're ever in acquisition or IPO talks, the story is simple.
 spaCy can also be used as free open-source software, under the Aferro GPL
 license.  If you use it this way, you must comply with the AGPL license terms.
 When you distribute your project, or offer it as a network service, you must
-distribute the source-code, and grant users an AGPL license to it.
+distribute the source-code and grant users an AGPL license to it.
 
 
 .. I left academia in June 2014, just when I should have been submitting my first

@@ -234,4 +234,3 @@ Features
   +---------+-----------------------------------------------------------+
   | prob    | Log probability of word, smoothed with Simple Good-Turing |
   +---------+-----------------------------------------------------------+
-

@@ -7,8 +7,8 @@ Updates
 Five days ago I presented the alpha release of spaCy, a natural language
 processing library that brings state-of-the-art technology to small companies.
 
-spaCy has been very well received, and there are now a lot of eyes on the project.
-Naturally, lots of issues have surfaced.  I'm very grateful to those who've reported
+spaCy has been well received, and there are now a lot of eyes on the project.
+Naturally, lots of issues have surfaced.  I'm grateful to those who've reported
 them.  I've worked hard to address them as quickly as I could.
 
 Bug Fixes

@@ -108,9 +108,9 @@ input to be segmented into sentences, but with no sentence segmenter.  This
 caused a drop in parse accuracy of 4%!
 
 Over the last five days, I've worked hard to correct this.  I implemented the
-modifications to the parsing algorithm I had planned, from Dongdong Zhang et al
+modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
 (2013), and trained and evaluated the parser on raw text, using the version of
-the WSJ distributed by Read et al (2012), and used in Dridan and Oepen's
+the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
 experiments.
 
 I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly

fabfile.py (vendored) | 2 +-
@@ -1,4 +1,4 @@
-from fabric.api import local, run, lcd, cd, env
+from fabric.api import local, lcd, env
 from os.path import exists as file_exists
 from fabtools.python import virtualenv
 from os import path

setup.py | 5 -
@@ -1,16 +1,11 @@
 #!/usr/bin/env python
-import subprocess
 from setuptools import setup
-from glob import glob
 import shutil
 
 import sys
 import os
 from os import path
-from os.path import splitext
 
-
-import shutil
 from setuptools import Extension
 from distutils import sysconfig
 import platform

@@ -79,5 +79,3 @@ cpdef enum attr_id_t:
     POS
     TAG
     DEP
-
-

@@ -22,4 +22,3 @@ cdef class EnPosTagger:
 
     cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
     cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
-

@@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
         context[7] = 4
     else:
         context[7] = 0
-

@@ -149,5 +149,3 @@ cpdef enum:
 
 
 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
-
-

@@ -15,4 +15,3 @@ cdef class Span:
     cdef public Span head
     cdef public list rights
     cdef public list lefts
-

@@ -277,5 +277,3 @@ class OracleError(Exception):
 
 class UnknownMove(Exception):
     pass
-
-

@@ -13,5 +13,3 @@ class Config(object):
     @classmethod
     def read(cls, model_dir, name):
         return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
-
-

@@ -630,4 +630,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
 Check that the parser data is installed. Run "python -m spacy.en.download" if not.
 Check whether parse=False in the call to English.__call__
 """
-

@@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
-
-

@@ -1,4 +1,3 @@
-import os
 from os import path
 import codecs
 import json

@@ -33,4 +33,3 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
     cdef PreshMap _map
-  

@@ -7,6 +7,7 @@ from spacy.lexeme import lex_of
 
 from spacy import LEX, NORM, SHAPE, LAST3
 
+
 def test_group_by_lex():
     tokens = en.tokenize("I like the red one and I like the blue one")
     names, hashes, groups = tokens.group_by(LEX)

@@ -40,6 +40,7 @@ def test_begin(state, sentence):
     assert not state.is_valid('O')
     assert not state.is_valid('U-PER')
 
+
 def test_in(state, sentence):
     state.transition('B-PER')
     assert state.n_ents == 0

@@ -30,6 +30,3 @@ def test_align_continue():
     assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
     assert aligned[3] == ('and', [(13, 16)])
     assert aligned[4] == ('continue', [(16, 24)])
-
-
-

@@ -37,5 +37,3 @@ def test_dep():
     assert feats_array[1][1] == tokens[1].dep
     assert feats_array[2][1] == tokens[2].dep
     assert feats_array[3][1] == tokens[3].dep
-
-

@@ -2,6 +2,7 @@
 """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 from spacy.en.attrs import IS_LOWER
 
+
 def test_1():
     import spacy.en
     from spacy.parts_of_speech import ADV
@@ -39,6 +40,7 @@ def test2():
     nlp.vocab[u'quietly'].prob
     -11.07155704498291
 
+
 def test3():
     import spacy.en
     from spacy.parts_of_speech import ADV

						 | 
					@ -8,6 +8,7 @@ from spacy.en import English
 | 
				
			||||||
def EN():
 | 
					def EN():
 | 
				
			||||||
    return English()
 | 
					    return English()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_tweebo_challenge(EN):
 | 
					def test_tweebo_challenge(EN):
 | 
				
			||||||
    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
 | 
					    text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
 | 
				
			||||||
    tokens = EN(text)
 | 
					    tokens = EN(text)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -16,6 +16,7 @@ def words():
 | 
				
			||||||
    return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!",
 | 
					    return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!",
 | 
				
			||||||
            "!d", "\nd"]
 | 
					            "!d", "\nd"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_is_alpha(words):
 | 
					def test_is_alpha(words):
 | 
				
			||||||
    assert not is_alpha(words[0])
 | 
					    assert not is_alpha(words[0])
 | 
				
			||||||
    assert not is_alpha(words[1])
 | 
					    assert not is_alpha(words[1])
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,10 +5,12 @@ from spacy.strings import StringStore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture
 | 
					@pytest.fixture
 | 
				
			||||||
def sstore():
 | 
					def sstore():
 | 
				
			||||||
    return StringStore()
 | 
					    return StringStore()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_save_bytes(sstore):
 | 
					def test_save_bytes(sstore):
 | 
				
			||||||
    Hello_i = sstore[b'Hello']
 | 
					    Hello_i = sstore[b'Hello']
 | 
				
			||||||
    assert Hello_i == 1
 | 
					    assert Hello_i == 1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
@@ -2,10 +2,12 @@ import pytest
 
 from spacy.en import English
 
+
 @pytest.fixture
 def EN():
     return English()
 
+
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
         lex = EN.vocab[i]

@@ -35,4 +35,3 @@ def test_merge_heads():
 def test_issue_54():
     text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
     tokens = NLU(text, merge_mwes=True)
-

@@ -17,6 +17,7 @@ def morph_exc():
             'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}},
            }
 
+
 def test_load_exc(EN, morph_exc):
     EN.tagger.load_morph_exceptions(morph_exc)
     tokens = EN('I like his style.', tag=True)

@@ -3,6 +3,7 @@ from spacy.en import English
 
 nlp = English()
 
+
 def test_simple_types():
     tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)

@@ -33,4 +33,3 @@ def test_word():
 def test_not_number():
     assert not like_number('dog')
     assert not like_number(',')
-

@@ -3,6 +3,7 @@ import pytest
 
 from spacy.en import English
 
+
 def test_only_pre1():
     EN = English()
     assert len(EN("(")) == 1

@@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
         assert not children
     for head_index, children in rights.items():
         assert not children
-

@@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):
 def test_double_end_quote(EN):
     assert len(EN("Hello''")) == 2
     assert len(EN("''")) == 1
-

@@ -3,6 +3,7 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def EN():
     return English()

@@ -8,20 +8,26 @@ from spacy.orth import word_shape as ws
 def test_capitalized():
     assert ws('Nasa') == 'Xxxx'
 
+
 def test_truncate():
     assert ws('capitalized') == 'xxxx'
 
+
 def test_digits():
     assert ws('999999999') == 'dddd'
 
+
 def test_mix():
     assert ws('C3P0') == 'XdXd'
 
+
 def test_punct():
     assert ws(',') == ','
 
+
 def test_space():
     assert ws('\n') == '\n'
 
+
 def test_punct_seq():
     assert ws('``,-') == '``,-'

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import English
 
 import pytest
-import re
 
 
 EN = English()

@@ -13,9 +13,11 @@ def EN():
 def test_no_special(EN):
     assert len(EN("(can)")) == 3
 
+
 def test_no_punct(EN):
     assert len(EN("can't")) == 2
 
+
 def test_prefix(EN):
     assert len(EN("(can't")) == 3
 

@@ -16,6 +16,3 @@ def test_one(EN):
     assert tokens[0].orth_ == 'Betty'
     tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].orth_ == 'Betty'
-
-
-

@@ -16,4 +16,3 @@ def test_subtrees():
     assert len(list(bus.children)) == 1
 
     assert len(list(wheels.subtree)) == 6
-

@@ -1,6 +1,7 @@
 from spacy.en import English
 import six
 
+
 def test_tag_names():
     nlp = English()
     tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True)

@@ -6,6 +6,7 @@ import pytest
 
 NLU = English()
 
+
 def test_am_pm():
     numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
     variants = ['a.m.', 'am', 'p.m.', 'pm']

@@ -4,6 +4,7 @@ import pytest
 from spacy.en import English
 from spacy.parts_of_speech import ADV
 
+
 @pytest.fixture
 def nlp():
     return English()

@@ -7,6 +7,8 @@ from spacy.en.attrs import IS_STOP
 import pytest
 
 nlp = English()
+
+
 @pytest.fixture
 def token():
     tokens = nlp(u'Give it back! He pleaded.')
@@ -35,5 +37,3 @@ def test_single_token_string():
     nlp = English()
     tokens = nlp(u'foobar')
     assert tokens[0].string == 'foobar'
-
-

@@ -31,6 +31,7 @@ def _orphan_from_list(toks):
         lst.append(tok)
     return lst
 
+
 def test_list_orphans():
     # Test case from NSchrading
     nlp = English()

@@ -10,10 +10,12 @@ from spacy.en import English
 def EN():
     return English().tokenizer
 
+
 def test_no_word(EN):
     tokens = EN(u'')
     assert len(tokens) == 0
 
+
 def test_single_word(EN):
     tokens = EN(u'hello')
     assert tokens[0].orth_ == 'hello'
@@ -60,6 +62,7 @@ def test_contraction_punct(EN):
     tokens = EN("can't!")
     assert len(tokens) == 3
 
+
 def test_sample(EN):
     text = """Tributes pour in for late British Labour Party leader
 

@@ -3,6 +3,7 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def tokens():
     nlp = English()

@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 from spacy.orth import like_url
 
+
 def test_basic_url():
     assert like_url('www.google.com')
     assert like_url('google.com')

@@ -4,15 +4,18 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def EN():
     return English()
 
+
 def test_vec(EN):
     hype = EN.vocab['hype']
     assert hype.orth_ == 'hype'
     assert 0.08 >= hype.repvec[0] > 0.07
 
+
 def test_capitalized(EN):
     hype = EN.vocab['Hype']
     assert hype.orth_ == 'Hype'

@@ -39,5 +39,3 @@ def test_newline_double_space(EN):
 def test_newline_space_wrap(EN):
     tokens = EN('hello \n possums')
     assert len(tokens) == 3
-
-

@@ -4,7 +4,6 @@ from spacy.en import English
 from spacy.util import utf8open
 
 import pytest
-import os
 from os import path
 
 