Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-13 05:07:03 +03:00

Merge branch 'master' of ssh://github.com/honnibal/spaCy

This commit is contained in: fb8d50b3d5
.gitignore (vendored): 1 changed line
@@ -17,6 +17,7 @@ models/
 spacy/syntax/*.cpp
+spacy/syntax/*.html
 spacy/en/*.cpp
 spacy/en/data/*
 spacy/*.cpp
 spacy/ner/*.cpp
 spacy/orthography/*.cpp

@@ -8,12 +8,12 @@ python:
 - "2.7"
 - "3.4"
 
-# command to install dependencies
+# install dependencies
 install:
 - "pip install --upgrade setuptools"
 - "pip install -r requirements.txt"
 - "export PYTHONPATH=`pwd`"
 - "python setup.py build_ext --inplace"
-# command to run tests
+# run tests
 script:
 - py.test tests/

@@ -3,20 +3,18 @@ spaCy
 
 http://honnibal.github.io/spaCy
 
-Fast, state-of-the-art natural language processing pipeline. Commercial licenses available, or use under AGPL.
+A pipeline for fast, state-of-the-art natural language processing. Commercial licenses available, otherwise under AGPL.
 
 Version 0.80 released
 ---------------------
 
 2015-04-13
 
-* Preliminary named entity recognition support. Accuracy is currently
-  substantially behind the current state-of-the-art. I'm working on
-  improvements.
+* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
 
 * Better sentence boundary detection, drawn from the syntactic structure.
 
-* Lots of bug fixes
+* Lots of bug fixes.
 
 Supports:

@@ -35,4 +33,3 @@ Difficult to support:
 
 * PyPy 2.7
 * PyPy 3.4
-

@@ -30,5 +30,3 @@ def main(text_loc):
 
 if __name__ == '__main__':
     plac.call(main)
-
-

@@ -7,8 +7,6 @@ from os import path
 import shutil
 import codecs
 import random
 import time
 import gzip
 
 import plac
 import cProfile

@@ -1,8 +1,6 @@
 """Read a vector file, and prepare it as binary data, for easy consumption"""
 
 import bz2
 import plac
 import struct
 
 from spacy.vocab import write_binary_vectors
 

@@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in th
     $ git add -A spaCy/contributors/<your GitHub username>.md
 
 Now finish your pull request, and you're done.
-
-

contributors/suchow.md (new file): 95 added lines
@@ -0,0 +1,95 @@
+Syllogism Contributor Agreement
+===============================
+
+This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
+Agreement. The SCA applies to any contribution that you make to any product or
+project managed by us (the “project”), and sets out the intellectual property
+rights you grant to us in the contributed materials. The term “us” shall mean
+Syllogism Co. The term "you" shall mean the person or entity identified below.
+If you agree to be bound by these terms, fill in the information requested below
+and include the filled-in version with your first pull-request, under the file
+contributors/. The name of the file should be your GitHub username, with the
+extension .md. For example, the user example_user would create the file
+spaCy/contributors/example_user.md.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+1. The term 'contribution' or ‘contributed materials’ means any source code,
+object code, patch, tool, sample, graphic, specification, manual, documentation,
+or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and registrations,
+in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such assignment
+    is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
+    irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
+    to exercise all rights under those copyrights. This includes, at our option, the
+    right to sublicense these same rights to third parties through multiple levels of
+    sublicensees or other licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your contribution
+    as if each of us were the sole owners, and if one of us makes a derivative work
+    of your contribution, the one who makes the derivative work (or has it made) will
+    be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution against
+    us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and exercise
+    all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the consent
+    of, pay or render an accounting to the other for any use or distribution of your
+    contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
+worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer your
+    contribution in whole or in part, alone or in combination with
+    or included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through multiple
+    levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective on
+the date you first submitted a contribution to us, even if your submission took
+place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * each contribution that you submit is and shall be an original work of authorship
+    and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any third
+    party's copyrights, trademarks, patents, or other intellectual property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and other
+    applicable export and import laws. You agree to notify us if you become aware of
+    any circumstance which would make any of the foregoing representations inaccurate
+    in any respect. Syllogism Co. may publicly disclose your participation in the project,
+    including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable U.S.
+Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    x___ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
+
+    ____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
+
+| Field                           | Entry                |
+|-------------------------------- | -------------------- |
+| Name                            | Jordan Suchow        |
+| Company's name (if applicable)  |                      |
+| Title or Role (if applicable)   |                      |
+| Date                            | 2015-04-19           |
+| GitHub username                 | suchow               |
+| Website (optional)              | http://suchow.io     |

@@ -64,8 +64,6 @@ def clean(ext):
         if os.path.exists(html):
             os.unlink(html)
-
-
 
 HERE = os.path.dirname(__file__)
 virtual_env = os.environ.get('VIRTUAL_ENV', '')
 compile_args = []

@@ -75,4 +75,3 @@ Boolean features
 +-------------+--------------------------------------------------------------+
 | IN_LIST     | Facility for loading arbitrary run-time word lists?         |
 +-------------+--------------------------------------------------------------+
-

@@ -28,14 +28,14 @@ can access an excellent set of pre-computed orthographic and distributional features:
     >>> are.check_flag(en.CAN_NOUN)
     False
 
-spaCy makes it easy to write very efficient NLP applications, because your feature
+spaCy makes it easy to write efficient NLP applications, because your feature
 functions have to do almost no work: almost every lexical property you'll want
 is pre-computed for you. See the tutorial for an example POS tagger.
 
 Benchmark
 ---------
 
-The tokenizer itself is also very efficient:
+The tokenizer itself is also efficient:
 
 +--------+-------+--------------+--------------+
 | System | Time  | Words/second | Speed Factor |

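The pre-computed-properties claim above is easy to see in code. A minimal sketch, assuming the 0.x-era spacy.en API that appears throughout this commit, and that the English data has been installed (python -m spacy.en.download)::

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back! He pleaded.')
    for token in tokens:
        # The vocab lookup and the prob attribute are plain table reads; the
        # smoothed log probability was computed once per vocabulary entry,
        # not once per token.
        lexeme = nlp.vocab[token.orth_]
        print(token.orth_, lexeme.prob)
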
@@ -56,7 +56,7 @@ Pros:
 
 - All tokens come with indices into the original string
 - Full unicode support
-- Extensible to other languages
+- Extendable to other languages
 - Batch operations computed efficiently in Cython
 - Cython API
 - numpy interoperability

@@ -68,4 +68,3 @@ Cons:
 - Higher memory usage (up to 1gb)
 - More conceptually complicated
 - Tokenization rules expressed in code, not as data
-

@@ -135,7 +135,7 @@ lexical types.
 
 In a sample of text, vocabulary size grows exponentially slower than word
 count. So any computations we can perform over the vocabulary and apply to the
-word count are very efficient.
+word count are efficient.
 
 
 Part-of-speech Tagger

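The vocabulary-versus-word-count observation above suggests a simple pattern: do expensive work once per vocabulary type, then a cheap lookup per token. A sketch under the same 0.x-era API assumptions; expensive_feature is a hypothetical stand-in, not a spaCy function::

    from spacy.en import English

    nlp = English()

    def expensive_feature(word):
        # hypothetical stand-in for any costly per-word computation
        return len(set(word)) / float(len(word) or 1)

    doc = nlp(u'I like the red one and I like the blue one')
    cache = {}
    for token in doc:
        if token.orth_ not in cache:  # runs at most once per vocabulary type
            cache[token.orth_] = expensive_feature(token.orth_)
    feats = [cache[token.orth_] for token in doc]  # cheap per-token lookups
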
@@ -260,5 +260,3 @@ these models is really all about the data structures. We want to stay small,
 and stay contiguous. Minimize redundancy and minimize pointer chasing.
 That's why Cython is so well suited to this: we get to lay out our data
 structures, and manage the memory ourselves, with full C-level control.
-
-

@@ -37,7 +37,7 @@ tokenizer is suitable for production use.
 
 I used to think that the NLP community just needed to do more to communicate
 its findings to software engineers. So I wrote two blog posts, explaining
-`how to write a part-of-speech tagger`_ and `parser`_. Both were very well received,
+`how to write a part-of-speech tagger`_ and `parser`_. Both were well received,
 and there's been a bit of interest in `my research software`_ --- even though
 it's entirely undocumented, and mostly unusable to anyone but me.

@@ -202,7 +202,7 @@ this:
 
 We wanted to refine the logic so that only adverbs modifying evocative verbs
 of communication, like "pleaded", were highlighted. We've now built a vector that
-represents that type of word, so now we can highlight adverbs based on very
+represents that type of word, so now we can highlight adverbs based on
 subtle logic, honing in on adverbs that seem the most stylistically
 problematic, given our starting assumptions:

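The refinement described above can be sketched with the repvec vectors and the ADV part-of-speech tag that appear elsewhere in this commit. The example sentence and the 0.5 threshold are illustrative assumptions, not taken from the original post::

    import numpy

    from spacy.en import English
    from spacy.parts_of_speech import ADV

    def cosine(a, b):
        return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))

    nlp = English()
    pleaded = nlp.vocab[u'pleaded'].repvec  # vector for the prototype word
    doc = nlp(u'"Give it back," he pleaded abjectly.')
    for token in doc:
        if token.pos == ADV and cosine(token.repvec, pleaded) > 0.5:
            print(token.orth_)  # highlight only "pleaded"-like adverbs
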
@@ -278,6 +278,3 @@ sentence represents the document as a whole.
 
 Document Model
 --------------
-
-
-

@@ -35,7 +35,7 @@ And if you're ever in acquisition or IPO talks, the story is simple.
 
 spaCy can also be used as free open-source software, under the Affero GPL
 license. If you use it this way, you must comply with the AGPL license terms.
 When you distribute your project, or offer it as a network service, you must
-distribute the source-code, and grant users an AGPL license to it.
+distribute the source-code and grant users an AGPL license to it.
 
 .. I left academia in June 2014, just when I should have been submitting my first

@@ -234,4 +234,3 @@ Features
 +---------+-----------------------------------------------------------+
 | prob    | Log probability of word, smoothed with Simple Good-Turing |
 +---------+-----------------------------------------------------------+
-

@@ -7,8 +7,8 @@ Updates
 Five days ago I presented the alpha release of spaCy, a natural language
 processing library that brings state-of-the-art technology to small companies.
 
-spaCy has been very well received, and there are now a lot of eyes on the project.
-Naturally, lots of issues have surfaced. I'm very grateful to those who've reported
+spaCy has been well received, and there are now a lot of eyes on the project.
+Naturally, lots of issues have surfaced. I'm grateful to those who've reported
 them. I've worked hard to address them as quickly as I could.
 
 Bug Fixes

@@ -108,9 +108,9 @@ input to be segmented into sentences, but with no sentence segmenter. This
 caused a drop in parse accuracy of 4%!
 
 Over the last five days, I've worked hard to correct this. I implemented the
-modifications to the parsing algorithm I had planned, from Dongdong Zhang et al
+modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
 (2013), and trained and evaluated the parser on raw text, using the version of
-the WSJ distributed by Read et al (2012), and used in Dridan and Oepen's
+the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
 experiments.
 
 I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly

fabfile.py (vendored): 2 changed lines
@@ -1,4 +1,4 @@
-from fabric.api import local, run, lcd, cd, env
+from fabric.api import local, lcd, env
 from os.path import exists as file_exists
 from fabtools.python import virtualenv
 from os import path

setup.py: 5 changed lines
@@ -1,16 +1,11 @@
 #!/usr/bin/env python
 import subprocess
 from setuptools import setup
 from glob import glob
 import shutil
 
 import sys
 import os
 from os import path
 from os.path import splitext
-
-
-import shutil
 from setuptools import Extension
 from distutils import sysconfig
 import platform

@@ -79,5 +79,3 @@ cpdef enum attr_id_t:
     POS
     TAG
     DEP
-
-

@@ -22,4 +22,3 @@ cdef class EnPosTagger:
 
     cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
     cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
-

@@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
         context[7] = 4
     else:
         context[7] = 0
-

@@ -149,5 +149,3 @@ cpdef enum:
 
 
 cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
-
-

@@ -15,4 +15,3 @@ cdef class Span:
     cdef public Span head
     cdef public list rights
     cdef public list lefts
-

@@ -277,5 +277,3 @@ class OracleError(Exception):
 
 class UnknownMove(Exception):
     pass
-
-

@@ -13,5 +13,3 @@ class Config(object):
     @classmethod
     def read(cls, model_dir, name):
         return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
-
-

@@ -630,4 +630,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
 Check that the parser data is installed. Run "python -m spacy.en.download" if not.
 Check whether parse=False in the call to English.__call__
 """
-

@@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
-
-

@@ -1,4 +1,3 @@
-import os
 from os import path
 import codecs
 import json

@@ -33,4 +33,3 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
     cdef PreshMap _map
-

@@ -7,6 +7,7 @@ from spacy.lexeme import lex_of
 
 from spacy import LEX, NORM, SHAPE, LAST3
 
+
 def test_group_by_lex():
     tokens = en.tokenize("I like the red one and I like the blue one")
     names, hashes, groups = tokens.group_by(LEX)

@@ -40,6 +40,7 @@ def test_begin(state, sentence):
     assert not state.is_valid('O')
     assert not state.is_valid('U-PER')
 
+
 def test_in(state, sentence):
     state.transition('B-PER')
     assert state.n_ents == 0

@@ -30,6 +30,3 @@ def test_align_continue():
     assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
     assert aligned[3] == ('and', [(13, 16)])
     assert aligned[4] == ('continue', [(16, 24)])
-
-
-

@@ -37,5 +37,3 @@ def test_dep():
     assert feats_array[1][1] == tokens[1].dep
     assert feats_array[2][1] == tokens[2].dep
     assert feats_array[3][1] == tokens[3].dep
-
-

@@ -2,6 +2,7 @@
 """Sphinx doctest is just too hard. Manually paste doctest examples here"""
 from spacy.en.attrs import IS_LOWER
 
+
 def test_1():
     import spacy.en
     from spacy.parts_of_speech import ADV

@@ -39,6 +40,7 @@ def test2():
     nlp.vocab[u'quietly'].prob
     -11.07155704498291
 
+
 def test3():
     import spacy.en
     from spacy.parts_of_speech import ADV

@@ -8,6 +8,7 @@ from spacy.en import English
 def EN():
     return English()
 
+
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
     tokens = EN(text)

@@ -16,6 +16,7 @@ def words():
     return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!",
             "!d", "\nd"]
 
+
 def test_is_alpha(words):
     assert not is_alpha(words[0])
     assert not is_alpha(words[1])

@@ -5,10 +5,12 @@ from spacy.strings import StringStore
 
 import pytest
 
+
 @pytest.fixture
 def sstore():
     return StringStore()
 
+
 def test_save_bytes(sstore):
     Hello_i = sstore[b'Hello']
     assert Hello_i == 1

@@ -2,10 +2,12 @@ import pytest
 
 from spacy.en import English
 
+
 @pytest.fixture
 def EN():
     return English()
 
+
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
         lex = EN.vocab[i]

@@ -35,4 +35,3 @@ def test_merge_heads():
 def test_issue_54():
     text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
     tokens = NLU(text, merge_mwes=True)
-

@@ -17,6 +17,7 @@ def morph_exc():
         'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}},
     }
 
+
 def test_load_exc(EN, morph_exc):
     EN.tagger.load_morph_exceptions(morph_exc)
     tokens = EN('I like his style.', tag=True)

@@ -3,6 +3,7 @@ from spacy.en import English
 
 nlp = English()
 
+
 def test_simple_types():
     tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)

@@ -33,4 +33,3 @@ def test_word():
 def test_not_number():
     assert not like_number('dog')
     assert not like_number(',')
-

@@ -3,6 +3,7 @@ import pytest
 
 from spacy.en import English
 
+
 def test_only_pre1():
     EN = English()
     assert len(EN("(")) == 1

@@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
         assert not children
     for head_index, children in rights.items():
         assert not children
-

@@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):
 def test_double_end_quote(EN):
     assert len(EN("Hello''")) == 2
     assert len(EN("''")) == 1
-

@@ -3,6 +3,7 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def EN():
     return English()

@@ -8,20 +8,26 @@ from spacy.orth import word_shape as ws
 def test_capitalized():
     assert ws('Nasa') == 'Xxxx'
 
+
 def test_truncate():
     assert ws('capitalized') == 'xxxx'
 
+
 def test_digits():
     assert ws('999999999') == 'dddd'
 
+
 def test_mix():
     assert ws('C3P0') == 'XdXd'
 
+
 def test_punct():
     assert ws(',') == ','
 
+
 def test_space():
     assert ws('\n') == '\n'
 
+
 def test_punct_seq():
     assert ws('``,-') == '``,-'

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 from spacy.en import English
 
 import pytest
 import re
 
-
 EN = English()

@@ -13,9 +13,11 @@ def EN():
 def test_no_special(EN):
     assert len(EN("(can)")) == 3
 
+
 def test_no_punct(EN):
     assert len(EN("can't")) == 2
 
+
 def test_prefix(EN):
     assert len(EN("(can't")) == 3
 

@@ -16,6 +16,3 @@ def test_one(EN):
     assert tokens[0].orth_ == 'Betty'
     tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].orth_ == 'Betty'
-
-
-

@@ -16,4 +16,3 @@ def test_subtrees():
     assert len(list(bus.children)) == 1
 
     assert len(list(wheels.subtree)) == 6
-

@@ -1,6 +1,7 @@
 from spacy.en import English
 import six
 
+
 def test_tag_names():
     nlp = English()
     tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True)

@@ -6,6 +6,7 @@ import pytest
 
 NLU = English()
 
+
 def test_am_pm():
     numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
     variants = ['a.m.', 'am', 'p.m.', 'pm']

@@ -4,6 +4,7 @@ import pytest
 from spacy.en import English
 from spacy.parts_of_speech import ADV
 
+
 @pytest.fixture
 def nlp():
     return English()

@@ -7,6 +7,8 @@ from spacy.en.attrs import IS_STOP
 import pytest
 
+
 nlp = English()
 
+
 @pytest.fixture
 def token():
     tokens = nlp(u'Give it back! He pleaded.')

@@ -35,5 +37,3 @@ def test_single_token_string():
     nlp = English()
     tokens = nlp(u'foobar')
     assert tokens[0].string == 'foobar'
-
-

@@ -31,6 +31,7 @@ def _orphan_from_list(toks):
         lst.append(tok)
     return lst
 
+
 def test_list_orphans():
     # Test case from NSchrading
     nlp = English()

@@ -10,10 +10,12 @@ from spacy.en import English
 def EN():
     return English().tokenizer
 
+
 def test_no_word(EN):
     tokens = EN(u'')
     assert len(tokens) == 0
 
+
 def test_single_word(EN):
     tokens = EN(u'hello')
     assert tokens[0].orth_ == 'hello'

@@ -60,6 +62,7 @@ def test_contraction_punct(EN):
     tokens = EN("can't!")
     assert len(tokens) == 3
 
+
 def test_sample(EN):
     text = """Tributes pour in for late British Labour Party leader

@@ -3,6 +3,7 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def tokens():
     nlp = English()

@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 from spacy.orth import like_url
 
+
 def test_basic_url():
     assert like_url('www.google.com')
     assert like_url('google.com')

@@ -4,15 +4,18 @@ from spacy.en import English
 
 import pytest
 
+
 @pytest.fixture
 def EN():
     return English()
 
+
 def test_vec(EN):
     hype = EN.vocab['hype']
     assert hype.orth_ == 'hype'
     assert 0.08 >= hype.repvec[0] > 0.07
 
+
 def test_capitalized(EN):
     hype = EN.vocab['Hype']
     assert hype.orth_ == 'Hype'

@@ -39,5 +39,3 @@ def test_newline_double_space(EN):
 def test_newline_space_wrap(EN):
     tokens = EN('hello \n possums')
     assert len(tokens) == 3
-
-

@@ -4,7 +4,6 @@ from spacy.en import English
 from spacy.util import utf8open
 
 import pytest
 import os
 from os import path
 
-