diff --git a/.gitignore b/.gitignore index 426f13518..e2f9c5de5 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ models/ spacy/syntax/*.cpp spacy/syntax/*.html spacy/en/*.cpp +spacy/en/data/* spacy/*.cpp spacy/ner/*.cpp spacy/orthography/*.cpp diff --git a/.travis.yml b/.travis.yml index 83c7da85f..6571f55bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,12 +8,12 @@ python: - "2.7" - "3.4" -# command to install dependencies +# install dependencies install: - "pip install --upgrade setuptools" - "pip install -r requirements.txt" - "export PYTHONPATH=`pwd`" - "python setup.py build_ext --inplace" -# command to run tests +# run tests script: - py.test tests/ diff --git a/LICENSE.txt b/LICENSE.txt index 1f8de8e72..c5117738b 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,7 +1,7 @@ spaCy is commercial open-source software: you can buy a commercial license, or you can use it under the AGPL, as described below. -spaCy Natural Language Processing Tools +spaCy Natural Language Processing Tools Copyright (C) 2015 Matthew Honnibal This program is free software: you can redistribute it and/or modify diff --git a/README.md b/README.md index 29a82cd75..a72ccf2c6 100644 --- a/README.md +++ b/README.md @@ -3,20 +3,18 @@ spaCy http://honnibal.github.io/spaCy -Fast, state-of-the-art natural language processing pipeline. Commercial licenses available, or use under AGPL. +A pipeline for fast, state-of-the-art natural language processing. Commercial licenses available, otherwise under AGPL. Version 0.80 released --------------------- 2015-04-13 -* Preliminary named entity recognition support. Accuracy is currently - substantially behind the current state-of-the-art. I'm working on - improvements. +* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements. * Better sentence boundary detection, drawn from the syntactic structure. -* Lots of bug fixes +* Lots of bug fixes. 
Supports: @@ -35,4 +33,3 @@ Difficult to support: * PyPy 2.7 * PyPy 3.4 - diff --git a/bin/ner_tag.py b/bin/ner_tag.py index e7ec1e51e..34588bd12 100644 --- a/bin/ner_tag.py +++ b/bin/ner_tag.py @@ -30,5 +30,3 @@ def main(text_loc): if __name__ == '__main__': plac.call(main) - - diff --git a/bin/parser/train.py b/bin/parser/train.py index e590b297b..9ae3a3267 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -7,8 +7,6 @@ from os import path import shutil import codecs import random -import time -import gzip import plac import cProfile @@ -134,7 +132,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False, print 'NER P', scorer.ents_p print 'NER R', scorer.ents_r print 'NER F', scorer.ents_f - + if __name__ == '__main__': plac.call(main) diff --git a/bin/prepare_vecs.py b/bin/prepare_vecs.py index a8c774052..b55dafee3 100644 --- a/bin/prepare_vecs.py +++ b/bin/prepare_vecs.py @@ -1,15 +1,13 @@ """Read a vector file, and prepare it as binary data, for easy consumption""" -import bz2 import plac -import struct from spacy.vocab import write_binary_vectors def main(in_loc, out_loc): write_binary_vectors(in_loc, out_loc) - + if __name__ == '__main__': plac.call(main) diff --git a/contributors/cla.md b/contributors/cla.md index cc63ca444..27b522dc8 100644 --- a/contributors/cla.md +++ b/contributors/cla.md @@ -1,7 +1,7 @@ Signing the Contributors License Agreement ========================================== -SpaCy is a commercial open-source project, owned by Syllogism Co. We require that contributors to SpaCy sign our Contributors License Agreement, which is based on the Oracle Contributor Agreement. +SpaCy is a commercial open-source project, owned by Syllogism Co. We require that contributors to SpaCy sign our Contributors License Agreement, which is based on the Oracle Contributor Agreement. The CLA must be signed on your first pull request. 
To do this, simply fill in the file cla_template.md, and include the filed in form in your first pull request. @@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in th $ git add -A spaCy/contributors/.md Now finish your pull request, and you're done. - - diff --git a/contributors/cla_template.md b/contributors/cla_template.md index fb54da72d..fca6771de 100644 --- a/contributors/cla_template.md +++ b/contributors/cla_template.md @@ -2,7 +2,7 @@ Syllogism Contributor Agreement =============================== This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor -Agreement. The SCA applies to any contribution that you make to any product or +Agreement. The SCA applies to any contribution that you make to any product or project managed by us (the “project”), and sets out the intellectual property rights you grant to us in the contributed materials. The term “us” shall mean Syllogism Co. The term "you" shall mean the person or entity identified below. diff --git a/contributors/suchow.md b/contributors/suchow.md new file mode 100644 index 000000000..099e78c2c --- /dev/null +++ b/contributors/suchow.md @@ -0,0 +1,95 @@ +Syllogism Contributor Agreement +=============================== + +This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor +Agreement. The SCA applies to any contribution that you make to any product or +project managed by us (the “project”), and sets out the intellectual property +rights you grant to us in the contributed materials. The term “us” shall mean +Syllogism Co. The term "you" shall mean the person or entity identified below. +If you agree to be bound by these terms, fill in the information requested below +and include the filled-in version with your first pull-request, under the file +contrbutors/. The name of the file should be your GitHub username, with the +extension .md. 
For example, the user example_user would create the file +spaCy/contributors/example_user.md . + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +1. The term 'contribution' or ‘contributed materials’ means any source code, +object code, patch, tool, sample, graphic, specification, manual, documentation, +or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and registrations, +in your contribution: + * you hereby assign to us joint ownership, and to the extent that such assignment + is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, + irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license + to exercise all rights under those copyrights. This includes, at our option, the + right to sublicense these same rights to third parties through multiple levels of + sublicensees or other licensing arrangements; + + * you agree that each of us can do all things in relation to your contribution + as if each of us were the sole owners, and if one of us makes a derivative work + of your contribution, the one who makes the derivative work (or has it made) will + be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution against + us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and exercise + all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the consent + of, pay or render an accounting to the other for any use or distribution of your + contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, +worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer your + contribution in whole or in part, alone or in combination with + or included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through multiple + levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective on +the date you first submitted a contribution to us, even if your submission took +place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of authorship + and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any third + party's copyrights, trademarks, patents, or other intellectual property rights; and + + * each contribution shall be in compliance with U.S. export control laws and other + applicable export and import laws. You agree to notify us if you become aware of + any circumstance which would make any of the foregoing representations inaccurate + in any respect. Syllogism Co. may publicly disclose your participation in the project, + including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable U.S. + Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + +x___ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect my contributions. + +____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity. + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jordan Suchow | +| Company's name (if applicable) | | +| Title or Role (if applicable) | | +| Date | 2015-04-19 | +| GitHub username | suchow | +| Website (optional) | http://suchow.io | + diff --git a/dev_setup.py b/dev_setup.py index 3b8fc9f73..8efaba40b 100644 --- a/dev_setup.py +++ b/dev_setup.py @@ -64,8 +64,6 @@ def clean(ext): if os.path.exists(html): os.unlink(html) - - HERE = os.path.dirname(__file__) virtual_env = os.environ.get('VIRTUAL_ENV', '') compile_args = [] @@ -102,7 +100,7 @@ exts = [ Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args), Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"], **ext_args) - + #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes), #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes), diff --git a/docs/source/api.rst b/docs/source/api.rst index e8638ed55..bb85b45ae 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -28,7 +28,7 @@ API .. autoclass:: spacy.tokens.Tokens - + +---------------+-------------+-------------+ | Attribute | Type | Attr API | +===============+=============+=============+ @@ -48,7 +48,7 @@ API For faster access, the underlying C data can be accessed from Cython. You can also export the data to a numpy array, via `Tokens.to_array`, if pure Python access is required, and you need slightly better performance. 
However, this - is both slower and has a worse API than Cython access. + is both slower and has a worse API than Cython access. .. autoclass:: spacy.tokens.Token @@ -107,7 +107,7 @@ API *derivational* suffixes are not stripped, e.g. the lemma of "instutitions" is "institution", not "institute". Lemmatization is performed using the WordNet data, but extended to also cover closed-class words such as - pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". + pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". We assign pronouns the lemma -PRON-. lower @@ -119,9 +119,9 @@ API shape A transform of the word's string, to show orthographic features. The - characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. + characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are - truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, + truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :) prefix @@ -161,7 +161,7 @@ API pos A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. - + dep The type of syntactic dependency relation between the word and its syntactic head. @@ -185,10 +185,10 @@ API rights An iterator for the immediate rightward syntactic children of the word. - + children An iterator that yields from lefts, and then yields from rights. - + subtree An iterator for the part of the sentence syntactically governed by the word, including the word itself. @@ -205,15 +205,15 @@ API .. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) .. py:method:: __len__(self) --> int - + .. py:method:: __getitem__(self, id: int) --> unicode - + .. py:method:: __getitem__(self, string: unicode) --> int - + .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None .. 
py:method:: dump(self, loc: unicode) --> None - + .. py:method:: load_lexemes(self, loc: unicode) --> None .. py:method:: load_vectors(self, loc: unicode) --> None @@ -223,9 +223,9 @@ API .. py:method:: __len__(self) --> int .. py:method:: __getitem__(self, id: int) --> unicode - + .. py:method:: __getitem__(self, string: bytes) --> id - + .. py:method:: __getitem__(self, string: unicode) --> id .. py:method:: dump(self, loc: unicode) --> None diff --git a/docs/source/features.rst b/docs/source/features.rst index 3e586dd68..ecd465182 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -66,7 +66,7 @@ Boolean features +-------------+--------------------------------------------------------------+ | IS_UPPER | The result of sic.isupper() | +-------------+--------------------------------------------------------------+ -| LIKE_URL | Check whether the string looks like it could be a URL. Aims | +| LIKE_URL | Check whether the string looks like it could be a URL. Aims | | | for low false negative rate. | +-------------+--------------------------------------------------------------+ | LIKE_NUMBER | Check whether the string looks like it could be a numeric | @@ -75,4 +75,3 @@ Boolean features +-------------+--------------------------------------------------------------+ | IN_LIST | Facility for loading arbitrary run-time word lists? | +-------------+--------------------------------------------------------------+ - diff --git a/docs/source/guide/overview.rst b/docs/source/guide/overview.rst index 59d0810d8..6faaaa67f 100644 --- a/docs/source/guide/overview.rst +++ b/docs/source/guide/overview.rst @@ -6,7 +6,7 @@ What and Why spaCy is a lightning-fast, full-cream NLP tokenizer and lexicon. -Most tokenizers give you a sequence of strings. That's barbaric. +Most tokenizers give you a sequence of strings. That's barbaric. Giving you strings invites you to compute on every *token*, when what you should be doing is computing on every *type*. 
Remember `Zipf's law `_: you'll @@ -28,14 +28,14 @@ can access an excellent set of pre-computed orthographic and distributional feat >>> are.check_flag(en.CAN_NOUN) False -spaCy makes it easy to write very efficient NLP applications, because your feature +spaCy makes it easy to write efficient NLP applications, because your feature functions have to do almost no work: almost every lexical property you'll want is pre-computed for you. See the tutorial for an example POS tagger. Benchmark --------- -The tokenizer itself is also very efficient: +The tokenizer itself is also efficient: +--------+-------+--------------+--------------+ | System | Time | Words/second | Speed Factor | @@ -56,7 +56,7 @@ Pros: - All tokens come with indices into the original string - Full unicode support -- Extensible to other languages +- Extendable to other languages - Batch operations computed efficiently in Cython - Cython API - numpy interoperability @@ -68,4 +68,3 @@ Cons: - Higher memory usage (up to 1gb) - More conceptually complicated - Tokenization rules expressed in code, not as data - diff --git a/docs/source/howworks.rst b/docs/source/howworks.rst index 5538988d1..00d61d66d 100644 --- a/docs/source/howworks.rst +++ b/docs/source/howworks.rst @@ -116,13 +116,13 @@ this was written quickly and has not been executed): This procedure splits off tokens from the start and end of the string, at each -point checking whether the remaining string is in our special-cases table. If +point checking whether the remaining string is in our special-cases table. If it is, we stop splitting, and return the tokenization at that point. The advantage of this design is that the prefixes, suffixes and special-cases can be declared separately, in easy-to-understand files. If a new entry is added to the special-cases, you can be sure that it won't have some unforeseen -consequence to a complicated regular-expression grammar. +consequence to a complicated regular-expression grammar. 
Coupling the Tokenizer and Lexicon ################################## @@ -135,7 +135,7 @@ lexical types. In a sample of text, vocabulary size grows exponentially slower than word count. So any computations we can perform over the vocabulary and apply to the -word count are very efficient. +word count are efficient. Part-of-speech Tagger @@ -159,7 +159,7 @@ Dependency Parser The parser uses the algorithm described in my `2014 blog post`_. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due -to its compelling speed/accuracy trade-off. +to its compelling speed/accuracy trade-off. Some quick details about spaCy's take on this, for those who happen to know these models well. I'll write up a better description shortly. @@ -176,7 +176,7 @@ scored 91.0. So how have I gotten it to 92.4? The following tweaks: 1. I use Brown cluster features --- these help a lot; 2. I redesigned the feature set. I've long known that the Zhang and Nivre (2011) feature set was suboptimal, but a few features don't make a very - compelling publication. Still, they're important. + compelling publication. Still, they're important. 3. When I do the dynamic oracle training, I also make the upate cost-sensitive: if the oracle determines that the move the parser took has a cost of N, then the weights for the gold class are incremented by @@ -206,8 +206,8 @@ loop: class_, score = max(enumerate(scores), key=lambda item: item[1]) transition(state, class_) -The parser makes 2N transitions for a sentence of length N. In order to select -the transition, it extracts a vector of K features from the state. Each feature +The parser makes 2N transitions for a sentence of length N. In order to select +the transition, it extracts a vector of K features from the state. Each feature is used as a key into a hash table managed by the model. The features map to a vector of weights, of length C. We then dot product the feature weights to the scores vector we are building for that instance. 
@@ -253,12 +253,10 @@ the classes. In the case of the parser, this means the hash table is accessed 2NKC times, instead of the 2NK times if you have a weights vector. You should also be careful to store the weights contiguously in memory --- you don't want a linked list here. I use a block-sparse format, because my problems tend to -have a few dozen classes. +have a few dozen classes. I guess if I had to summarize my experience, I'd say that the efficiency of these models is really all about the data structures. We want to stay small, and stay contiguous. Minimize redundancy and minimize pointer chasing. That's why Cython is so well suited to this: we get to lay out our data structures, and manage the memory ourselves, with full C-level control. - - diff --git a/docs/source/index.rst b/docs/source/index.rst index bab31eb7b..08fbb8046 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,7 +10,7 @@ spaCy: Industrial-strength NLP .. _Issue Tracker: https://github.com/honnibal/spaCy/issues -**13/04**: *Version 0.80 released. Includes named entity recognition, better sentence +**13/04**: *Version 0.80 released. Includes named entity recognition, better sentence boundary detection, and many bug fixes.* `spaCy`_ is a new library for text processing in Python and Cython. @@ -28,7 +28,7 @@ If they don't want to stay in academia, they join Google, IBM, etc. The net result is that outside of the tech giants, commercial NLP has changed little in the last ten years. In academia, it's changed entirely. Amazing -improvements in quality. Orders of magnitude faster. But the +improvements in quality. Orders of magnitude faster. But the academic code is always GPL, undocumented, unuseable, or all three. You could implement the ideas yourself, but the papers are hard to read, and training data is exorbitantly expensive. So what are you left with? A common answer is @@ -37,7 +37,7 @@ tokenizer is suitable for production use. 
I used to think that the NLP community just needed to do more to communicate its findings to software engineers. So I wrote two blog posts, explaining -`how to write a part-of-speech tagger`_ and `parser`_. Both were very well received, +`how to write a part-of-speech tagger`_ and `parser`_. Both were well received, and there's been a bit of interest in `my research software`_ --- even though it's entirely undocumented, and mostly unuseable to anyone but me. @@ -58,14 +58,14 @@ to embedded word representations, and a range of useful features are pre-calcula and cached. If none of that made any sense to you, here's the gist of it. Computers don't -understand text. This is unfortunate, because that's what the web almost entirely +understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it. spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under -the AGPL, or you can `buy a commercial license`_ for a one-time fee. +the AGPL, or you can `buy a commercial license`_ for a one-time fee. .. _buy a commercial license: license.html @@ -148,7 +148,7 @@ cosine metric: >>> from numpy import dot >>> from numpy.linalg import norm - + >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) >>> words = [w for w in nlp.vocab if w.has_repvec] >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) @@ -200,9 +200,9 @@ this: -We wanted to refine the logic so that only adverbs modifying evocative verbs +We wanted to refine the logic so that only adverbs modifying evocative verbs of communication, like "pleaded", were highlighted. 
We've now built a vector that -represents that type of word, so now we can highlight adverbs based on very +represents that type of word, so now we can highlight adverbs based on subtle logic, honing in on adverbs that seem the most stylistically problematic, given our starting assumptions: @@ -213,7 +213,7 @@ problematic, given our starting assumptions: >>> from spacy.parts_of_speech import ADV, VERB >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) >>> def is_bad_adverb(token, target_verb, tol): - ... if token.pos != ADV + ... if token.pos != ADV: ... return False ... elif token.head.pos != VERB: ... return False @@ -238,11 +238,11 @@ database, and processed with an NLP library, to one of three levels of detail --- tokenization, tagging, or parsing. The tasks are additive: to parse the text you have to tokenize and tag it. The pre-processing was not subtracted from the times --- I report the time required for the pipeline to complete. -I report mean times per document, in milliseconds. +I report mean times per document, in milliseconds. **Hardware**: Intel i7-3770 (2012) -.. table:: Efficiency comparison. Lower is better. +.. table:: Efficiency comparison. Lower is better. +--------------+---------------------------+--------------------------------+ | | Absolute (ms per doc) | Relative (to spaCy) | @@ -278,7 +278,7 @@ representations. publish or perform any benchmark or performance tests or analysis relating to the Service or the use thereof without express authorization from AlchemyAPI; -.. Did you get that? You're not allowed to evaluate how well their system works, +.. Did you get that? You're not allowed to evaluate how well their system works, unless you're granted a special exception. Their system must be pretty terrible to motivate such an embarrassing restriction. They must know this makes them look bad, but they apparently believe allowing @@ -287,7 +287,7 @@ representations. .. spaCy is based on science, not alchemy.
It's open source, and I am happy to clarify any detail of the algorithms I've implemented. It's evaluated against the current best published systems, following the standard - methodologies. These evaluations show that it performs extremely well. + methodologies. These evaluations show that it performs extremely well. Accuracy Comparison ------------------- @@ -299,7 +299,7 @@ Accuracy Comparison +--------------+----------+------------+ | spaCy | 97.2 | 92.4 | +--------------+----------+------------+ - | CoreNLP | 96.9 | 92.2 | + | CoreNLP | 96.9 | 92.2 | +--------------+----------+------------+ | ZPar | 97.3 | 92.9 | +--------------+----------+------------+ @@ -329,5 +329,5 @@ previous fastest parser that I'm aware of. quickstart.rst api.rst howworks.rst - license.rst + license.rst updates.rst diff --git a/docs/source/lexrank_tutorial.rst b/docs/source/lexrank_tutorial.rst index 5f3e472dd..f5c5ae8fd 100644 --- a/docs/source/lexrank_tutorial.rst +++ b/docs/source/lexrank_tutorial.rst @@ -97,7 +97,7 @@ like lead-text take a while to float up the priority list. This strategy also h the advantage of transparency: it's obvious to users how the decision is being made, so nobody is likely to complain about the feature if it works this way. -Instead of cutting off the text mid-word, we can tokenize the text, and +Instead of cutting off the text mid-word, we can tokenize the text, and +----------------+-----------+ | System | Rouge-1 R | @@ -116,7 +116,7 @@ A simple bag-of-words model can be created using the `count_by` method, which produces a dictionary of frequencies, keyed by string IDs: .. code:: python - + >>> from spacy.en import English >>> from spacy.en.attrs import SIC >>> nlp = English() @@ -148,7 +148,7 @@ from any token: - + .. 
_word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ @@ -196,8 +196,8 @@ undirected --- so, it's natural to represent this as a matrix: from scipy.spatial.distance import cosine import numpy - - + + def lexrank(sent_vectors): n = len(sent_vectors) # Build the cosine similarity matrix @@ -205,7 +205,7 @@ undirected --- so, it's natural to represent this as a matrix: for i in range(n): for j in range(n): matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j]) - # Normalize + # Normalize for i in range(n): matrix[i] /= sum(matrix[i]) return _pagerank(matrix) @@ -278,6 +278,3 @@ sentence represents the document as a whole. Document Model -------------- - - - diff --git a/docs/source/license.rst b/docs/source/license.rst index feb83feab..5edf22095 100644 --- a/docs/source/license.rst +++ b/docs/source/license.rst @@ -13,7 +13,7 @@ I've been writing spaCy for six months now, and I'm very excited to release it. I think it's the most valuable thing I could have built. When I was in academia, I noticed that small companies couldn't really make use of our work. Meanwhile the tech giants have been hiring *everyone*, and putting this stuff -into production. I think spaCy can change that. +into production. I think spaCy can change that. +------------+-----------+----------+-------------------------------------+ @@ -35,7 +35,7 @@ And if you're ever in acquisition or IPO talks, the story is simple. spaCy can also be used as free open-source software, under the Aferro GPL license. If you use it this way, you must comply with the AGPL license terms. When you distribute your project, or offer it as a network service, you must -distribute the source-code, and grant users an AGPL license to it. +distribute the source-code and grant users an AGPL license to it. .. 
I left academia in June 2014, just when I should have been submitting my first @@ -52,14 +52,14 @@ Examples -------- In order to clarify how spaCy's license structure might apply to you, I've -written a few examples, in the form of user-stories. +written a few examples, in the form of user-stories. Ashley and Casey: Seed stage start-up ##################################### Ashley and Casey have an idea for a start-up. To explore their idea, they want to build a minimum viable product they can put in front of potential users and -investors. +investors. They have two options. @@ -75,7 +75,7 @@ They have two options. import a module that imports it, etc). They also cannot use spaCy as a network resource, by running it as a service --- this is the loophole that the "A" part of the AGPL is designed to close. - + Ashley and Casey find the AGPL license unattractive for commercial use. They decide to take up the trial commercial license. However, over the next 90 days, Ashley has to move house twice, and Casey gets @@ -92,7 +92,7 @@ developing. They own the copyright to any modifications they make to spaCy, but not to the original spaCy code. No additional fees will be due when they hire new developers, run spaCy on -additional internal servers, etc. If their company is acquired, the license will +additional internal servers, etc. If their company is acquired, the license will be transferred to the company acquiring them. However, to use spaCy in another product, they will have to buy a second license. @@ -115,9 +115,9 @@ In order to do this, they must sign a contributor agreement, ceding their copyright. When commercial licenses to spaCy are sold, Alex and Sasha will not be able to claim any royalties from their contributions. -Later, Alex and Sasha implement new features into spaCy, for another paper. The +Later, Alex and Sasha implement new features into spaCy, for another paper. 
The code was quite rushed, and they don't want to take the time to put together a -proper pull request. They must release their modifications under the AGPL, but +proper pull request. They must release their modifications under the AGPL, but they are not obliged to contribute it to the spaCy repository, or concede their copyright. @@ -126,8 +126,8 @@ Phuong and Jessie: Open Source developers ######################################### Phuong and Jessie use the open-source software Calibre to manage their e-book -libraries. They have an idea for a search feature, and they want to use spaCy -to implement it. Calibre is released under the GPLv3. The AGPL has additional +libraries. They have an idea for a search feature, and they want to use spaCy +to implement it. Calibre is released under the GPLv3. The AGPL has additional restrictions for projects used as a network resource, but they don't apply to this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll have to release their code, but that was always their intention anyway. diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 470df42d7..0226d5c88 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -18,12 +18,12 @@ With Python 2.7 or Python 3, using Linux or OSX, run: .. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz -The download command fetches and installs about 300mb of data, for the +The download command fetches and installs about 300mb of data, for the parser model and word vectors, which it installs within the spacy.en package directory. If you're stuck using a server with an old version of Python, and you don't have root access, I've prepared a bootstrap script to help you compile a local -Python install. Run: +Python install. Run: .. code:: bash @@ -47,7 +47,7 @@ this is how I build the project. 
$ py.test tests/ Python packaging is awkward at the best of times, and it's particularly tricky -with C extensions, built via Cython, requiring large data files. So, please +with C extensions, built via Cython, requiring large data files. So, please report issues as you encounter them, and bear with me :) Usage @@ -88,7 +88,7 @@ the original orthographic form of the word. .. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data')) - .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens + .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens +-----------------+--------------+--------------+ | Attribute | Type | Its API | @@ -121,7 +121,7 @@ the original orthographic form of the word. **Get sentence or named entity spans** .. py:attribute:: tokens.Tokens.sents --> Iterator[Span] - + .. py:attribute:: tokens.Tokens.ents --> Iterator[Span] You can iterate over a Span to access individual Tokens, or access its @@ -131,7 +131,7 @@ the original orthographic form of the word. **Embedded word representenations** .. py:attribute:: tokens.Token.repvec - + .. py:attribute:: lexeme.Lexeme.repvec @@ -150,13 +150,13 @@ the original orthographic form of the word. **Align to original string** .. py:attribute:: string: unicode - + Padded with original whitespace. .. py:attribute:: length: int Length, in unicode code-points. Equal to len(self.orth_). - + .. py:attribute:: idx: int Starting offset of word in the original string. 
@@ -234,4 +234,3 @@ Features +---------+-----------------------------------------------------------+ | prob | Log probability of word, smoothed with Simple Good-Turing | +---------+-----------------------------------------------------------+ - diff --git a/docs/source/updates.rst b/docs/source/updates.rst index 0d2eb0c9c..c796f31a5 100644 --- a/docs/source/updates.rst +++ b/docs/source/updates.rst @@ -7,8 +7,8 @@ Updates Five days ago I presented the alpha release of spaCy, a natural language processing library that brings state-of-the-art technology to small companies. -spaCy has been very well received, and there are now a lot of eyes on the project. -Naturally, lots of issues have surfaced. I'm very grateful to those who've reported +spaCy has been well received, and there are now a lot of eyes on the project. +Naturally, lots of issues have surfaced. I'm grateful to those who've reported them. I've worked hard to address them as quickly as I could. Bug Fixes @@ -21,22 +21,22 @@ Bug Fixes all look-ups into the vocabulary failed on wide unicode builds, which further meant that the part-of-speech tagger and parser features were not computed correctly. - + The fix is simple: we already have to read in a list of all the strings, so just store an index into that list, instead of a hash. * Parse tree navigation API was rough, and buggy. - The parse-tree navigation API was the last thing I added before v0.3. I've + The parse-tree navigation API was the last thing I added before v0.3. I've now replaced it with something better. The previous API design was flawed, and the implementation was buggy --- Token.child() and Token.head were sometimes inconsistent. I've addressed the most immediate problems, but this part of the design is - still a work in progress. It's a difficult problem. The parse is a tree, + still a work in progress. It's a difficult problem. 
The parse is a tree, and we want to freely navigate up and down it without creating reference cycles that inhibit garbage collection, and without doing a lot of copying, creating and deleting. - + I think I've got a promising solution to this, but I suspect there's currently a memory leak. Please get in touch no the tracker if you want to know more, especially if you think you can help. @@ -53,14 +53,14 @@ pinning down or reproducing. Please send details of your system to the Enhancements: Train and evaluate on whole paragraphs ---------------------------------------------------- -.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser. +.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser. Most English parsing research is performed on text with perfect pre-processing: one newline between every sentence, one space between every token. It's always been done this way, and it's good. It's a useful idealisation, because the pre-processing has few algorithmic implications. - + But, for practical performance, this stuff can matter a lot. Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few parsers on raw text. Even on the standard Wall Street Journal corpus, @@ -77,7 +77,7 @@ made a big difference: | Corrected | 89.9 | 88.8 | +-------------+-------+----------+ -.. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable. +.. 
note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable. @@ -108,9 +108,9 @@ input to be segmented into sentences, but with no sentence segmenter. This caused a drop in parse accuracy of 4%! Over the last five days, I've worked hard to correct this. I implemented the -modifications to the parsing algorithm I had planned, from Dongdong Zhang et al +modifications to the parsing algorithm I had planned, from Dongdong Zhang et al. (2013), and trained and evaluated the parser on raw text, using the version of -the WSJ distributed by Read et al (2012), and used in Dridan and Oepen's +the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's experiments. I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly diff --git a/fabfile.py b/fabfile.py index dc6177107..070fd4cda 100644 --- a/fabfile.py +++ b/fabfile.py @@ -1,4 +1,4 @@ -from fabric.api import local, run, lcd, cd, env +from fabric.api import local, lcd, env from os.path import exists as file_exists from fabtools.python import virtualenv from os import path diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json index fe361654a..41fda9aa7 100644 --- a/lang_data/en/morphs.json +++ b/lang_data/en/morphs.json @@ -1,7 +1,7 @@ { "PRP": { "I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1}, - "me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3}, + "me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3}, "mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2}, "myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4}, "you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0}, diff --git a/setup.py b/setup.py index 36b965ff3..9857ad955 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,11 @@ #!/usr/bin/env python -import subprocess from setuptools import setup -from glob import glob 
import shutil import sys import os from os import path -from os.path import splitext - -import shutil from setuptools import Extension from distutils import sysconfig import platform @@ -155,7 +150,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.morphology', 'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', - 'spacy.syntax.transition_system', + 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', 'spacy.syntax.conll', 'spacy.orth', 'spacy.syntax.ner'] diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd index d23bd5b19..4b111217e 100644 --- a/spacy/_ml.pxd +++ b/spacy/_ml.pxd @@ -33,7 +33,7 @@ cdef class Model: cdef class HastyModel: cdef Pool mem cdef weight_t* _scores - + cdef const weight_t* score(self, atom_t* context) except NULL cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d0727d287..f6e35d90b 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -79,5 +79,3 @@ cpdef enum attr_id_t: POS TAG DEP - - diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 66d1f705f..b50e2f006 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -129,19 +129,19 @@ class English(object): entity=parse_if_model_present, merge_mwes=False): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string - + The tagger and parser are lazy-loaded the first time they are required. Loading the parser model usually takes 5-10 seconds. - + Args: text (unicode): The text to be processed. Keyword args: tag (bool): Whether to add part-of-speech tags to the text. Also sets morphological analysis and lemmas. - + parse (True, False, -1): Whether to add labelled syntactic dependencies. - + -1 (default) is "guess": It will guess True if tag=True and the model has been installed. 
diff --git a/spacy/en/download.py b/spacy/en/download.py index ce0ab343e..1fdf24c0f 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -39,7 +39,7 @@ def install_parser_model(url, dest_dir): def install_dep_vectors(url, dest_dir): if not os.path.exists(dest_dir): os.mkdir(dest_dir) - + filename = download_file(url, dest_dir) diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 22d65cde2..b59481020 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -22,4 +22,3 @@ cdef class EnPosTagger: cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 - diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 20c9ad950..dd541c72a 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -353,7 +353,7 @@ cdef class EnPosTagger: cached.lemma = self.strings[lemma_str] set_morph_from_dict(&cached.morph, props) self._morph_cache.set(pos, orth, cached) - + cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: _fill_from_token(&context[P2_orth], &tokens[i-2]) @@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: context[7] = 4 else: context[7] = 0 - diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 41324cd38..87354d532 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -12,7 +12,7 @@ cdef LexemeC EMPTY_LEXEME cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, const float* empty_vec) except -1 - + cdef class Lexeme: cdef readonly ndarray repvec diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 7a2bce95a..d66161c83 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,12 +17,12 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store const float* empty_vec) except -1: lex.length = props['length'] lex.orth = string_store[props['orth']] - lex.lower = string_store[props['lower']] - lex.norm = string_store[props['norm']] - 
lex.shape = string_store[props['shape']] + lex.lower = string_store[props['lower']] + lex.norm = string_store[props['norm']] + lex.shape = string_store[props['shape']] lex.prefix = string_store[props['prefix']] lex.suffix = string_store[props['suffix']] - + lex.cluster = props['cluster'] lex.prob = props['prob'] lex.sentiment = props['sentiment'] diff --git a/spacy/ner/_feats.pyx b/spacy/ner/_feats.pyx index 18e073c5b..c1b6e1c35 100644 --- a/spacy/ner/_feats.pyx +++ b/spacy/ner/_feats.pyx @@ -58,10 +58,10 @@ LOCAL = ( (N3.sic,), (P4.sic,), (N4.sic,), - + (P1.sic, N0.sic,), (N0.sic, N1.sic), - + (N0.prefix,), (N0.suffix,), diff --git a/spacy/ner/annot.pyx b/spacy/ner/annot.pyx index d04345319..a1e582e5c 100644 --- a/spacy/ner/annot.pyx +++ b/spacy/ner/annot.pyx @@ -11,7 +11,7 @@ cdef class NERAnnotation: memset(self.starts, -1, sizeof(int) * length) memset(self.ends, -1, sizeof(int) * length) memset(self.labels, -1, sizeof(int) * length) - + cdef int start, end, label for start, end, label in entities: for i in range(start, end): diff --git a/spacy/ner/bilou_moves.pyx b/spacy/ner/bilou_moves.pyx index 42cef3fb7..a73a48135 100644 --- a/spacy/ner/bilou_moves.pyx +++ b/spacy/ner/bilou_moves.pyx @@ -107,7 +107,7 @@ cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag, # U, Gold L --> False # U, Gold O --> False return False - + cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0: cdef int n_accept = 0 @@ -160,7 +160,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: cdef int best = first_accept cdef weight_t score = scores[first_accept-1] cdef int i - for i in range(first_accept+1, n): + for i in range(first_accept+1, n): if moves[i].accept and scores[i-1] > score: best = i score = scores[i-1] @@ -179,7 +179,7 @@ cdef int transition(State *s, Move* move) except -1: end_entity(s) elif move.action == OUT: pass - s.tags[s.i] = move.clas + s.tags[s.i] = move.clas s.i += 1 diff --git 
a/spacy/ner/context.pxd b/spacy/ner/context.pxd index f9280c516..433334765 100644 --- a/spacy/ner/context.pxd +++ b/spacy/ner/context.pxd @@ -149,5 +149,3 @@ cpdef enum: cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1 - - diff --git a/spacy/ner/context.pyx b/spacy/ner/context.pyx index c062bb098..f6beb1501 100644 --- a/spacy/ner/context.pyx +++ b/spacy/ner/context.pyx @@ -18,7 +18,7 @@ cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos): c[T_postype] = lex.postype c[T_nertype] = 0 c[T_sensetype] = 0 - + c[T_is_alpha] = lex.flags & (1 << IS_ALPHA) c[T_is_digit] = lex.flags & (1 << IS_DIGIT) c[T_is_lower] = lex.flags & (1 << IS_LOWER) diff --git a/spacy/ner/feats.pyx b/spacy/ner/feats.pyx index 60910f235..b1657716e 100644 --- a/spacy/ner/feats.pyx +++ b/spacy/ner/feats.pyx @@ -7,10 +7,10 @@ LOCAL = ( (N1_sic,), (P2_sic,), (N2_sic,), - + (P1_sic, W_sic,), (W_sic, N1_sic), - + (W_prefix,), (W_suffix,), diff --git a/spacy/ner/greedy_parser.pyx b/spacy/ner/greedy_parser.pyx index 5825c7539..94d096529 100644 --- a/spacy/ner/greedy_parser.pyx +++ b/spacy/ner/greedy_parser.pyx @@ -92,7 +92,7 @@ cdef class NERParser: fill_context(self._context, s, tokens) self.extractor.extract(self._feats, self._values, self._context, NULL) self.model.score(self._scores, self._feats, self._values) - + set_accept_if_valid(self._moves, self.n_classes, s) guess = best_accepted(self._moves, self._scores, self.n_classes) assert guess.clas != 0 diff --git a/spacy/ner/io_moves.pxd b/spacy/ner/io_moves.pxd index 97f9512e8..50f6be106 100644 --- a/spacy/ner/io_moves.pxd +++ b/spacy/ner/io_moves.pxd @@ -16,7 +16,7 @@ cpdef enum ActionType: cdef int set_accept_if_oracle(Move* moves, int n, State* s, int* g_starts, int* g_ends, int* g_labels) except 0 - + cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0 cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL diff --git a/spacy/ner/io_moves.pyx b/spacy/ner/io_moves.pyx index 
dc268e4a5..257a18f3c 100644 --- a/spacy/ner/io_moves.pyx +++ b/spacy/ner/io_moves.pyx @@ -97,7 +97,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: cdef int best = first_accept cdef weight_t score = scores[first_accept-1] cdef int i - for i in range(first_accept+1, n): + for i in range(first_accept+1, n): if moves[i].accept and scores[i-1] > score: best = i score = scores[i-1] @@ -105,7 +105,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL: cdef int transition(State *s, Move* move) except -1: - s.tags[s.i] = move.clas + s.tags[s.i] = move.clas if move.action == OUT: s.i += 1 elif move.action == SHIFT: diff --git a/spacy/ner/pystate.pxd b/spacy/ner/pystate.pxd index 9293fae01..6710d9f40 100644 --- a/spacy/ner/pystate.pxd +++ b/spacy/ner/pystate.pxd @@ -8,7 +8,7 @@ cdef class PyState: cdef readonly list tag_names cdef readonly int n_classes cdef readonly dict moves_by_name - + cdef Move* _moves cdef Move* _golds cdef State* _s diff --git a/spacy/scorer.py b/spacy/scorer.py index 684a9476f..a15d5564e 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -33,7 +33,7 @@ class Scorer(object): @property def ents_r(self): return (self.ents_tp / (self.ents_tp + self.ents_fn + 1e-100)) * 100 - + @property def ents_f(self): return (2 * self.ents_p * self.ents_r) / (self.ents_p + self.ents_r + 1e-100) diff --git a/spacy/spans.pxd b/spacy/spans.pxd index 94b0cde98..180a991ee 100644 --- a/spacy/spans.pxd +++ b/spacy/spans.pxd @@ -5,7 +5,7 @@ from .structs cimport Morphology, TokenC, LexemeC from .vocab cimport Vocab from .strings cimport StringStore - + cdef class Span: cdef readonly Tokens _seq cdef public int i @@ -15,4 +15,3 @@ cdef class Span: cdef public Span head cdef public list rights cdef public list lefts - diff --git a/spacy/structs.pxd b/spacy/structs.pxd index c1fc13ecd..4892aa7b9 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -8,7 +8,7 @@ cdef struct LexemeC: const float* repvec flags_t flags 
- + attr_t id attr_t length @@ -18,7 +18,7 @@ cdef struct LexemeC: attr_t shape attr_t prefix attr_t suffix - + attr_t cluster float prob diff --git a/spacy/syntax/_parse_features.pxd b/spacy/syntax/_parse_features.pxd index 1801d1738..0a5965671 100644 --- a/spacy/syntax/_parse_features.pxd +++ b/spacy/syntax/_parse_features.pxd @@ -99,7 +99,7 @@ cpdef enum: S0_shape S0_ne_iob S0_ne_type - + S0r2w S0r2W S0r2p @@ -164,7 +164,7 @@ cpdef enum: N0_shape N0_ne_iob N0_ne_type - + N1w N1W N1p @@ -190,7 +190,7 @@ cpdef enum: N2_shape N2_ne_iob N2_ne_type - + P1w P1W P1p @@ -203,7 +203,7 @@ cpdef enum: P1_shape P1_ne_iob P1_ne_type - + P2w P2W P2p @@ -216,7 +216,7 @@ cpdef enum: P2_shape P2_ne_iob P2_ne_type - + E0w E0W E0p @@ -229,7 +229,7 @@ cpdef enum: E0_shape E0_ne_iob E0_ne_type - + E1w E1W E1p @@ -242,7 +242,7 @@ cpdef enum: E1_shape E1_ne_iob E1_ne_type - + # Misc features at the end dist N0lv diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 4b4f9b00e..8b07db979 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -111,10 +111,10 @@ ner = ( (N1W,), (P2W,), (N2W,), - + (P1W, N0W,), (N0W, N1W), - + (N0_prefix,), (N0_suffix,), @@ -205,22 +205,22 @@ ner = ( unigrams = ( (S2W, S2p), (S2c6, S2p), - + (S1W, S1p), (S1c6, S1p), (S0W, S0p), (S0c6, S0p), - + (N0W, N0p), (N0p,), (N0c,), (N0c6, N0p), (N0L,), - + (N1W, N1p), (N1c6, N1p), - + (N2W, N2p), (N2c6, N2p), diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 12295905b..37b2fb30e 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -27,7 +27,7 @@ cdef int pop_stack(State *s) except -1: s.stack -= 1 if s.stack_len == 0 and not at_eol(s): push_stack(s) - + cdef int push_stack(State *s) except -1: assert s.i < s.sent_len diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 119e07402..606629c66 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -3,7 +3,7 @@ from cymem.cymem 
cimport Pool from thinc.typedefs cimport weight_t -from ._state cimport State +from ._state cimport State from .transition_system cimport TransitionSystem, Transition diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 0e1bc6b20..f9b270c30 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -277,5 +277,3 @@ class OracleError(Exception): class UnknownMove(Exception): pass - - diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 58e98c1e1..09495ae92 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -36,7 +36,7 @@ from . import _parse_features from ._parse_features cimport fill_context, CONTEXT_SIZE -DEBUG = False +DEBUG = False def set_debug(val): global DEBUG DEBUG = val @@ -111,7 +111,7 @@ cdef class GreedyParser: scores = self.model.score(context) guess = self.moves.best_valid(scores, state) best = self.moves.best_gold(scores, state, gold) - + cost = guess.get_cost(&guess, state, gold) self.model.update(context, guess.clas, best.clas, cost) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index f0eac376a..44fe43949 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -34,16 +34,16 @@ cdef class TransitionSystem: cdef int finalize_state(self, State* state) except -1 cdef int preprocess_gold(self, GoldParse gold) except -1 - + cdef Transition lookup_transition(self, object name) except * - + cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition best_valid(self, const weight_t* scores, const State* state) except * cdef Transition best_gold(self, const weight_t* scores, const State* state, GoldParse gold) except * - + #cdef class PyState: # """Provide a Python class for testing purposes.""" diff --git a/spacy/syntax/util.py b/spacy/syntax/util.py index 3ba770ae2..64b259b6b 100644 --- a/spacy/syntax/util.py +++ b/spacy/syntax/util.py @@ -13,5 +13,3 @@ class Config(object): @classmethod def 
read(cls, model_dir, name): return cls(**json.load(open(path.join(model_dir, '%s.json' % name)))) - - diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f20367b08..7a1231a07 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -60,7 +60,7 @@ cdef class Tokenizer: split off a suffix, and repeat. Args: - string (unicode): The string to be tokenized. + string (unicode): The string to be tokenized. Returns: tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs. @@ -213,7 +213,7 @@ cdef class Tokenizer: cdef unicode string = chars[:length] match = self._infix_re.search(string) return match.start() if match is not None else 0 - + cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1: cdef unicode string = chars[:length] match = self._prefix_re.search(string) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 2038020bb..9ddd126a1 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -31,9 +31,9 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: cdef class Tokens: cdef Pool mem cdef Vocab vocab - + cdef TokenC* data - + cdef list _py_tokens cdef unicode _string @@ -61,7 +61,7 @@ cdef class Token: cdef int array_len cdef bint _owns_c_data - + cdef Tokens _seq @staticmethod diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 4e81b8c24..3d90abb8b 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -105,10 +105,10 @@ cdef class Tokens: def __getitem__(self, object i): """Retrieve a token. - + The Python Token objects are created lazily from internal C data, and cached in _py_tokens - + Returns: token (Token): """ @@ -181,7 +181,7 @@ cdef class Tokens: yield Span(self, start, i+1) start = None if start is not None: - yield Span(self, start, self.length) + yield Span(self, start, self.length) cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: if self.length == self.max_length: @@ -299,7 +299,7 @@ cdef class Tokens: # What to do about morphology?? 
# TODO: token.morph = ??? token.tag = self.vocab.strings[tag] - token.lemma = self.vocab.strings[lemma] + token.lemma = self.vocab.strings[lemma] if ent_type == 'O': token.ent_iob = 2 token.ent_type = 0 @@ -356,7 +356,7 @@ cdef class Tokens: self._py_tokens = [None] * self.length # Return the merged Python object return self[start] - + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -630,4 +630,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed. Check that the parser data is installed. Run "python -m spacy.en.download" if not. Check whether parse=False in the call to English.__call__ """ - diff --git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index 3eefab27d..c2c50dbcc 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -94,5 +94,3 @@ ctypedef uint64_t flags_t ctypedef uint32_t id_t ctypedef uint16_t len_t ctypedef uint16_t tag_t - - diff --git a/spacy/util.py b/spacy/util.py index cbc5dfbed..1d48ab7e9 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,4 +1,3 @@ -import os from os import path import codecs import json @@ -72,7 +71,7 @@ def read_detoken_rules(lang): for line in file_: entries.append(line.strip()) return entries - + def align_tokens(ref, indices): start = 0 @@ -88,7 +87,7 @@ def align_tokens(ref, indices): def detokenize(token_rules, words): - """To align with treebanks, return a list of "chunks", where a chunk is a + """To align with treebanks, return a list of "chunks", where a chunk is a sequence of tokens that are separated by whitespace in actual strings. Each chunk should be a tuple of token indices, e.g. 
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 25d62cffe..092bedda7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -31,6 +31,5 @@ cdef class Vocab: cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 - + cdef PreshMap _map - diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8ed9805a0..feb609c0e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -170,7 +170,7 @@ cdef class Vocab: self.lexemes[lexeme.id] = lexeme i += 1 fclose(fp) - + def load_rep_vectors(self, loc): file_ = _CFile(loc, b'rb') cdef int32_t word_len @@ -187,7 +187,7 @@ cdef class Vocab: except IOError: break file_.read(&vec_len, sizeof(vec_len), 1) - + mem = Address(word_len, sizeof(char)) chars = mem.ptr vec = self.mem.alloc(vec_len, sizeof(float)) diff --git a/tests/_depr_group_by.py b/tests/_depr_group_by.py index e0c7ce484..9f83c5ce9 100644 --- a/tests/_depr_group_by.py +++ b/tests/_depr_group_by.py @@ -7,6 +7,7 @@ from spacy.lexeme import lex_of from spacy import LEX, NORM, SHAPE, LAST3 + def test_group_by_lex(): tokens = en.tokenize("I like the red one and I like the blue one") names, hashes, groups = tokens.group_by(LEX) diff --git a/tests/depr_test_ner.py b/tests/depr_test_ner.py index bc492c9bf..80e643dd1 100644 --- a/tests/depr_test_ner.py +++ b/tests/depr_test_ner.py @@ -40,6 +40,7 @@ def test_begin(state, sentence): assert not state.is_valid('O') assert not state.is_valid('U-PER') + def test_in(state, sentence): state.transition('B-PER') assert state.n_ents == 0 diff --git a/tests/sun.tokens b/tests/sun.tokens index d16fa1eae..4b912e18e 100644 --- a/tests/sun.tokens +++ b/tests/sun.tokens @@ -1,4 +1,4 @@ -The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. 
[ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] +The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ] -The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. 
The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] -Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ] +The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. 
The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ] +Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. 
[ 24 ] [ 25 ] diff --git a/tests/test_align.py b/tests/test_align.py index 9d817e107..a603c4a74 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -30,6 +30,3 @@ def test_align_continue(): assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)]) assert aligned[3] == ('and', [(13, 16)]) assert aligned[4] == ('continue', [(16, 24)]) - - - diff --git a/tests/test_array.py b/tests/test_array.py index 7a08fbb8f..b6f0620c5 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -37,5 +37,3 @@ def test_dep(): assert feats_array[1][1] == tokens[1].dep assert feats_array[2][1] == tokens[2].dep assert feats_array[3][1] == tokens[3].dep - - diff --git a/tests/test_docs.py b/tests/test_docs.py index a6a44c154..7692413d8 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -2,6 +2,7 @@ """Sphinx doctest is just too hard. Manually paste doctest examples here""" from spacy.en.attrs import IS_LOWER + def test_1(): import spacy.en from spacy.parts_of_speech import ADV @@ -39,6 +40,7 @@ def test2(): nlp.vocab[u'quietly'].prob -11.07155704498291 + def test3(): import spacy.en from spacy.parts_of_speech import ADV @@ -57,7 +59,7 @@ def test3(): assert sum(o) != 0 from numpy import dot from numpy.linalg import norm - + cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec] words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py index 2725845a6..98ce58296 100644 --- a/tests/test_emoticons.py +++ b/tests/test_emoticons.py @@ -8,6 +8,7 @@ from spacy.en import English def EN(): return English() + def test_tweebo_challenge(EN): text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" tokens = EN(text) diff --git a/tests/test_flag_features.py b/tests/test_flag_features.py index 27b53d6e5..9c544b972 100644 --- a/tests/test_flag_features.py +++ b/tests/test_flag_features.py @@ -16,6 +16,7 @@ def 
words(): return ["1997", "19.97", "hello9", "Hello", "HELLO", "Hello9", "\n", "!", "!d", "\nd"] + def test_is_alpha(words): assert not is_alpha(words[0]) assert not is_alpha(words[1]) diff --git a/tests/test_intern.py b/tests/test_intern.py index 53e9fa400..a16340c41 100644 --- a/tests/test_intern.py +++ b/tests/test_intern.py @@ -5,10 +5,12 @@ from spacy.strings import StringStore import pytest + @pytest.fixture def sstore(): return StringStore() + def test_save_bytes(sstore): Hello_i = sstore[b'Hello'] assert Hello_i == 1 diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py index 64d84970e..ba666d4b0 100644 --- a/tests/test_iter_lexicon.py +++ b/tests/test_iter_lexicon.py @@ -2,10 +2,12 @@ import pytest from spacy.en import English + @pytest.fixture def EN(): return English() + def test_range_iter(EN): for i in range(len(EN.vocab)): lex = EN.vocab[i] diff --git a/tests/test_merge.py b/tests/test_merge.py index 370a334b8..58be79796 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -35,4 +35,3 @@ def test_merge_heads(): def test_issue_54(): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' tokens = NLU(text, merge_mwes=True) - diff --git a/tests/test_morph_exceptions.py b/tests/test_morph_exceptions.py index 72714c7a7..c2dbbc7d0 100644 --- a/tests/test_morph_exceptions.py +++ b/tests/test_morph_exceptions.py @@ -17,6 +17,7 @@ def morph_exc(): 'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}}, } + def test_load_exc(EN, morph_exc): EN.tagger.load_morph_exceptions(morph_exc) tokens = EN('I like his style.', tag=True) diff --git a/tests/test_ner.py b/tests/test_ner.py index bedca8a8e..136145d28 100644 --- a/tests/test_ner.py +++ b/tests/test_ner.py @@ -3,6 +3,7 @@ from spacy.en import English nlp = English() + def test_simple_types(): tokens = nlp(u'Mr. 
Best flew to New York on Saturday morning.') ents = list(tokens.ents) diff --git a/tests/test_number.py b/tests/test_number.py index f305c981c..2ca840a06 100644 --- a/tests/test_number.py +++ b/tests/test_number.py @@ -33,4 +33,3 @@ def test_word(): def test_not_number(): assert not like_number('dog') assert not like_number(',') - diff --git a/tests/test_only_punct.py b/tests/test_only_punct.py index 384ad6332..a09beb9ef 100644 --- a/tests/test_only_punct.py +++ b/tests/test_only_punct.py @@ -3,6 +3,7 @@ import pytest from spacy.en import English + def test_only_pre1(): EN = English() assert len(EN("(")) == 1 diff --git a/tests/test_parse_navigate.py b/tests/test_parse_navigate.py index 30e257204..402779399 100644 --- a/tests/test_parse_navigate.py +++ b/tests/test_parse_navigate.py @@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text): assert not children for head_index, children in rights.items(): assert not children - diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py index f61759609..1d29a6ed6 100644 --- a/tests/test_post_punct.py +++ b/tests/test_post_punct.py @@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN): def test_double_end_quote(EN): assert len(EN("Hello''")) == 2 assert len(EN("''")) == 1 - diff --git a/tests/test_sbd.py b/tests/test_sbd.py index f4c603409..b9835f7dc 100644 --- a/tests/test_sbd.py +++ b/tests/test_sbd.py @@ -3,6 +3,7 @@ from spacy.en import English import pytest + @pytest.fixture def EN(): return English() diff --git a/tests/test_shape.py b/tests/test_shape.py index c517d3087..0568feb6a 100644 --- a/tests/test_shape.py +++ b/tests/test_shape.py @@ -8,20 +8,26 @@ from spacy.orth import word_shape as ws def test_capitalized(): assert ws('Nasa') == 'Xxxx' + def test_truncate(): assert ws('capitalized') == 'xxxx' + def test_digits(): assert ws('999999999') == 'dddd' + def test_mix(): assert ws('C3P0') == 'XdXd' + def test_punct(): assert ws(',') == ',' + def test_space(): assert ws('\n') == '\n' + def 
test_punct_seq(): assert ws('``,-') == '``,-' diff --git a/tests/test_span.py b/tests/test_span.py index cbbe4494d..99b96838a 100644 --- a/tests/test_span.py +++ b/tests/test_span.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from spacy.en import English import pytest -import re EN = English() diff --git a/tests/test_special_affix.py b/tests/test_special_affix.py index d2cc1c27c..cdc6a6d78 100644 --- a/tests/test_special_affix.py +++ b/tests/test_special_affix.py @@ -13,9 +13,11 @@ def EN(): def test_no_special(EN): assert len(EN("(can)")) == 3 + def test_no_punct(EN): assert len(EN("can't")) == 2 + def test_prefix(EN): assert len(EN("(can't")) == 3 diff --git a/tests/test_string_loading.py b/tests/test_string_loading.py index 86cd4f2a9..9f9fde1f8 100644 --- a/tests/test_string_loading.py +++ b/tests/test_string_loading.py @@ -16,6 +16,3 @@ def test_one(EN): assert tokens[0].orth_ == 'Betty' tokens2 = EN('Betty also bought a pound of butter.') assert tokens2[0].orth_ == 'Betty' - - - diff --git a/tests/test_subtree.py b/tests/test_subtree.py index b25ec233d..0f32105f0 100644 --- a/tests/test_subtree.py +++ b/tests/test_subtree.py @@ -16,4 +16,3 @@ def test_subtrees(): assert len(list(bus.children)) == 1 assert len(list(wheels.subtree)) == 6 - diff --git a/tests/test_tag_names.py b/tests/test_tag_names.py index 17d005d1f..875fd5560 100644 --- a/tests/test_tag_names.py +++ b/tests/test_tag_names.py @@ -1,6 +1,7 @@ from spacy.en import English import six + def test_tag_names(): nlp = English() tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True) diff --git a/tests/test_times.py b/tests/test_times.py index 88d9db050..bd63a2fc5 100644 --- a/tests/test_times.py +++ b/tests/test_times.py @@ -6,6 +6,7 @@ import pytest NLU = English() + def test_am_pm(): numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] variants = ['a.m.', 'am', 'p.m.', 'pm'] diff --git a/tests/test_token.py b/tests/test_token.py index 11d9d41f8..88ea6a5a5 
100644 --- a/tests/test_token.py +++ b/tests/test_token.py @@ -4,6 +4,7 @@ import pytest from spacy.en import English from spacy.parts_of_speech import ADV + @pytest.fixture def nlp(): return English() diff --git a/tests/test_token_api.py b/tests/test_token_api.py index eb7e1013b..ea1ee615c 100644 --- a/tests/test_token_api.py +++ b/tests/test_token_api.py @@ -7,6 +7,8 @@ from spacy.en.attrs import IS_STOP import pytest nlp = English() + + @pytest.fixture def token(): tokens = nlp(u'Give it back! He pleaded.') @@ -35,5 +37,3 @@ def test_single_token_string(): nlp = English() tokens = nlp(u'foobar') assert tokens[0].string == 'foobar' - - diff --git a/tests/test_token_references.py b/tests/test_token_references.py index b5fe9f941..99c632309 100644 --- a/tests/test_token_references.py +++ b/tests/test_token_references.py @@ -31,6 +31,7 @@ def _orphan_from_list(toks): lst.append(tok) return lst + def test_list_orphans(): # Test case from NSchrading nlp = English() diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 6c3ce3271..26d24b063 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -10,10 +10,12 @@ from spacy.en import English def EN(): return English().tokenizer + def test_no_word(EN): tokens = EN(u'') assert len(tokens) == 0 + def test_single_word(EN): tokens = EN(u'hello') assert tokens[0].orth_ == 'hello' @@ -60,18 +62,19 @@ def test_contraction_punct(EN): tokens = EN("can't!") assert len(tokens) == 3 + def test_sample(EN): text = """Tributes pour in for late British Labour Party leader -Tributes poured in from around the world Thursday -to the late Labour Party leader John Smith, who died earlier from a massive +Tributes poured in from around the world Thursday +to the late Labour Party leader John Smith, who died earlier from a massive heart attack aged 55. 
-In Washington, the US State Department issued a statement regretting "the +In Washington, the US State Department issued a statement regretting "the untimely death" of the rapier-tongued Scottish barrister and parliamentarian. "Mr. Smith, throughout his distinguished""" - + tokens = EN(text) assert len(tokens) > 5 diff --git a/tests/test_tokens_api.py b/tests/test_tokens_api.py index dee626a2b..46aecd5c7 100644 --- a/tests/test_tokens_api.py +++ b/tests/test_tokens_api.py @@ -3,6 +3,7 @@ from spacy.en import English import pytest + @pytest.fixture def tokens(): nlp = English() diff --git a/tests/test_urlish.py b/tests/test_urlish.py index f10659dc1..3faa40c5e 100644 --- a/tests/test_urlish.py +++ b/tests/test_urlish.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from spacy.orth import like_url + def test_basic_url(): assert like_url('www.google.com') assert like_url('google.com') diff --git a/tests/test_vec.py b/tests/test_vec.py index be214f379..31d0df3b0 100644 --- a/tests/test_vec.py +++ b/tests/test_vec.py @@ -4,15 +4,18 @@ from spacy.en import English import pytest + @pytest.fixture def EN(): return English() + def test_vec(EN): hype = EN.vocab['hype'] assert hype.orth_ == 'hype' assert 0.08 >= hype.repvec[0] > 0.07 + def test_capitalized(EN): hype = EN.vocab['Hype'] assert hype.orth_ == 'Hype' diff --git a/tests/test_whitespace.py b/tests/test_whitespace.py index a3a700235..19a453c51 100644 --- a/tests/test_whitespace.py +++ b/tests/test_whitespace.py @@ -39,5 +39,3 @@ def test_newline_double_space(EN): def test_newline_space_wrap(EN): tokens = EN('hello \n possums') assert len(tokens) == 3 - - diff --git a/tests/test_wiki_sun.py b/tests/test_wiki_sun.py index faad3eb30..afca2ea06 100644 --- a/tests/test_wiki_sun.py +++ b/tests/test_wiki_sun.py @@ -4,7 +4,6 @@ from spacy.en import English from spacy.util import utf8open import pytest -import os from os import path diff --git a/tests/tokenizer.sed b/tests/tokenizer.sed index f5f891c49..f39c04178 
100644 --- a/tests/tokenizer.sed +++ b/tests/tokenizer.sed @@ -20,7 +20,7 @@ s=\.\.\.= ... =g s=[,;:@#$%&]= & =g # Assume sentence tokenization has been done first, so split FINAL periods -# only. +# only. s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g # however, we may as well split ALL question marks and exclamation points, # since they shouldn't have the abbrev.-marker ambiguity problem