mirror of https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00

Remove trailing whitespace

This commit is contained in:
parent 5f0f940a1f
commit 3a8d9b37a6
@@ -35,4 +35,3 @@ Difficult to support:

* PyPy 2.7
* PyPy 3.4

@@ -30,5 +30,3 @@ def main(text_loc):

if __name__ == '__main__':
    plac.call(main)

@@ -132,7 +132,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,
    print 'NER P', scorer.ents_p
    print 'NER R', scorer.ents_r
    print 'NER F', scorer.ents_f


if __name__ == '__main__':
    plac.call(main)

@@ -7,7 +7,7 @@ from spacy.vocab import write_binary_vectors

def main(in_loc, out_loc):
    write_binary_vectors(in_loc, out_loc)


if __name__ == '__main__':
    plac.call(main)

@@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in the
    $ git add -A spaCy/contributors/<your GitHub username>.md

Now finish your pull request, and you're done.

@@ -102,7 +102,7 @@ exts = [
    Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args),
    Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"],
              **ext_args)

    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),

@@ -28,7 +28,7 @@ API


.. autoclass:: spacy.tokens.Tokens

    +---------------+-------------+-------------+
    | Attribute     | Type        | Attr API    |
    +===============+=============+=============+

@@ -48,7 +48,7 @@ API
For faster access, the underlying C data can be accessed from Cython. You
can also export the data to a numpy array, via `Tokens.to_array`, if pure Python
access is required, and you need slightly better performance. However, this
is both slower and has a worse API than Cython access.


.. autoclass:: spacy.tokens.Token

@@ -119,7 +119,7 @@ API

shape
    A transform of the word's string, to show orthographic features. The
    characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
    After these mappings, sequences of 4 or more of the same character are
    truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
    :) --> :)
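The shape transform described in the hunk above can be sketched in a few lines of plain Python. This is an illustrative sketch only, not spaCy's Cython implementation; the function name `word_shape` is mine:

.. code:: python

    import re

    def word_shape(string):
        # a-z -> x, A-Z -> X, 0-9 -> d; other characters pass through unchanged.
        mapped = []
        for c in string:
            if 'a' <= c <= 'z':
                mapped.append('x')
            elif 'A' <= c <= 'Z':
                mapped.append('X')
            elif '0' <= c <= '9':
                mapped.append('d')
            else:
                mapped.append(c)
        # Truncate runs of four or more identical characters to length four.
        return re.sub(r'(.)\1{4,}', lambda m: m.group(1) * 4, ''.join(mapped))

    assert word_shape('C3Po') == 'XdXx'
    assert word_shape('favorite') == 'xxxx'
    assert word_shape(':)') == ':)'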
@@ -161,7 +161,7 @@ API
pos
    A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB,
    ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech.

dep
    The type of syntactic dependency relation between the word and its
    syntactic head.

@@ -185,10 +185,10 @@ API

rights
    An iterator for the immediate rightward syntactic children of the word.

children
    An iterator that yields from lefts, and then yields from rights.

subtree
    An iterator for the part of the sentence syntactically governed by the
    word, including the word itself.

@@ -205,15 +205,15 @@ API
.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None)

  .. py:method:: __len__(self) --> int

  .. py:method:: __getitem__(self, id: int) --> unicode

  .. py:method:: __getitem__(self, string: unicode) --> int

  .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None

  .. py:method:: dump(self, loc: unicode) --> None

  .. py:method:: load_lexemes(self, loc: unicode) --> None

  .. py:method:: load_vectors(self, loc: unicode) --> None

@@ -223,9 +223,9 @@ API
  .. py:method:: __len__(self) --> int

  .. py:method:: __getitem__(self, id: int) --> unicode

  .. py:method:: __getitem__(self, string: bytes) --> id

  .. py:method:: __getitem__(self, string: unicode) --> id

  .. py:method:: dump(self, loc: unicode) --> None
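The two overloaded `__getitem__` signatures above describe a two-way interned string table (string to id, id back to string). A rough usage sketch, assuming the 0.x API exactly as given in these signatures (illustrative, not verified against the code):

.. code:: python

    >>> from spacy.en import English
    >>> nlp = English()
    >>> string_id = nlp.vocab.strings[u'apple']    # unicode --> id
    >>> nlp.vocab.strings[string_id]               # id --> unicode
    u'apple'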
@@ -75,4 +75,3 @@ Boolean features
+-------------+--------------------------------------------------------------+
| IN_LIST     | Facility for loading arbitrary run-time word lists?          |
+-------------+--------------------------------------------------------------+

@@ -68,4 +68,3 @@ Cons:
- Higher memory usage (up to 1gb)
- More conceptually complicated
- Tokenization rules expressed in code, not as data

@@ -122,7 +122,7 @@ it is, we stop splitting, and return the tokenization at that point.
The advantage of this design is that the prefixes, suffixes and special-cases
can be declared separately, in easy-to-understand files. If a new entry is
added to the special-cases, you can be sure that it won't have some unforeseen
consequence to a complicated regular-expression grammar.

Coupling the Tokenizer and Lexicon
##################################
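The prefix/suffix/special-case design described in the hunk above can be sketched as a plain-Python loop. This is a simplified illustration of the algorithm as described, not the actual Cython tokenizer; `prefixes`, `suffixes` and `special_cases` stand in for the rule files:

.. code:: python

    def tokenize_chunk(substring, prefixes, suffixes, special_cases):
        # Split one whitespace-delimited chunk into tokens.
        tokens = []
        suffix_stack = []
        while substring:
            # A special-case entry wins outright, and stops further splitting.
            if substring in special_cases:
                tokens.extend(special_cases[substring])
                substring = ''
                break
            # Otherwise try to split off a prefix, then a suffix, and repeat.
            prefix = next((p for p in prefixes if substring.startswith(p)), None)
            if prefix:
                tokens.append(prefix)
                substring = substring[len(prefix):]
                continue
            suffix = next((s for s in suffixes if substring.endswith(s)), None)
            if suffix:
                suffix_stack.append(suffix)
                substring = substring[:-len(suffix)]
                continue
            tokens.append(substring)
            substring = ''
        return tokens + list(reversed(suffix_stack))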
@@ -159,7 +159,7 @@ Dependency Parser

The parser uses the algorithm described in my `2014 blog post`_.
This algorithm, shift-reduce dependency parsing, is becoming widely adopted due
to its compelling speed/accuracy trade-off.

Some quick details about spaCy's take on this, for those who happen to know
these models well. I'll write up a better description shortly.
@@ -176,7 +176,7 @@ scored 91.0. So how have I gotten it to 92.4? The following tweaks:
1. I use Brown cluster features --- these help a lot;
2. I redesigned the feature set. I've long known that the Zhang and Nivre
   (2011) feature set was suboptimal, but a few features don't make a very
   compelling publication. Still, they're important.
3. When I do the dynamic oracle training, I also make
   the update cost-sensitive: if the oracle determines that the move the parser
   took has a cost of N, then the weights for the gold class are incremented by
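Point 3 above, the cost-sensitive update, reads as a perceptron update scaled by the oracle's cost. A minimal sketch of that reading (my own simplification; the real model, feature extraction and oracle are implemented in Cython, and since the sentence above is cut off mid-rule, the symmetric decrement of the guessed class is an assumption):

.. code:: python

    from collections import defaultdict

    def make_weights(n_classes):
        # One row of class weights per feature.
        return defaultdict(lambda: [0.0] * n_classes)

    def update(weights, features, guess_class, gold_class, cost):
        # Perceptron-style update, scaled by the cost of the move actually taken.
        if cost == 0 or guess_class == gold_class:
            return
        for feat in features:
            weights[feat][gold_class] += cost
            weights[feat][guess_class] -= cost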
|
@ -253,12 +253,10 @@ the classes. In the case of the parser, this means the hash table is accessed
|
||||||
2NKC times, instead of the 2NK times if you have a weights vector. You should
|
2NKC times, instead of the 2NK times if you have a weights vector. You should
|
||||||
also be careful to store the weights contiguously in memory --- you don't want
|
also be careful to store the weights contiguously in memory --- you don't want
|
||||||
a linked list here. I use a block-sparse format, because my problems tend to
|
a linked list here. I use a block-sparse format, because my problems tend to
|
||||||
have a few dozen classes.
|
have a few dozen classes.
|
||||||
|
|
||||||
I guess if I had to summarize my experience, I'd say that the efficiency of
|
I guess if I had to summarize my experience, I'd say that the efficiency of
|
||||||
these models is really all about the data structures. We want to stay small,
|
these models is really all about the data structures. We want to stay small,
|
||||||
and stay contiguous. Minimize redundancy and minimize pointer chasing.
|
and stay contiguous. Minimize redundancy and minimize pointer chasing.
|
||||||
That's why Cython is so well suited to this: we get to lay out our data
|
That's why Cython is so well suited to this: we get to lay out our data
|
||||||
structures, and manage the memory ourselves, with full C-level control.
|
structures, and manage the memory ourselves, with full C-level control.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
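The "stay contiguous" point above is about data layout rather than the learning rule. A rough numpy illustration of block-sparse weight storage, standing in for the C-level arrays the text says are managed directly in Cython:

.. code:: python

    import numpy

    n_classes = 64   # "a few dozen classes"

    # Block-sparse layout: one hash-table entry per feature, whose value is that
    # feature's whole row of class weights, stored contiguously. Scoring then
    # touches the table once per feature, not once per (feature, class) pair.
    weight_blocks = {}   # feature id -> numpy array of length n_classes

    def score(features):
        scores = numpy.zeros(n_classes)
        for feat in features:
            block = weight_blocks.get(feat)
            if block is not None:
                scores += block   # one contiguous vector add per feature
        return scores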
@@ -65,7 +65,7 @@ it, link it, filter it, categorise it, generate it and correct it.

spaCy provides a library of utility functions that help programmers build such
products. It's commercial open source software: you can either use it under
the AGPL, or you can `buy a commercial license`_ for a one-time fee.

.. _buy a commercial license: license.html

@@ -148,7 +148,7 @@ cosine metric:

    >>> from numpy import dot
    >>> from numpy.linalg import norm

    >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
    >>> words = [w for w in nlp.vocab if w.has_repvec]
    >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))

@@ -200,7 +200,7 @@ this:



We wanted to refine the logic so that only adverbs modifying evocative verbs
of communication, like "pleaded", were highlighted. We've now built a vector that
represents that type of word, so now we can highlight adverbs based on very
subtle logic, honing in on adverbs that seem the most stylistically
@@ -213,7 +213,7 @@ problematic, given our starting assumptions:
    >>> from spacy.parts_of_speech import ADV, VERB
    >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
    >>> def is_bad_adverb(token, target_verb, tol):
    ...     if token.pos != ADV:
    ...         return False
    ...     elif token.head.pos != VERB:
    ...         return False
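The example above breaks off after the first two checks. A possible completion, consistent with the surrounding text (compare the head verb's vector to the "pleaded"-style target vector against the tolerance); this is a guess at the intent, not the documented code:

.. code:: python

    >>> def is_bad_adverb(token, target_verb, tol):
    ...     if token.pos != ADV:
    ...         return False
    ...     elif token.head.pos != VERB:
    ...         return False
    ...     elif cosine(token.head.repvec, target_verb.repvec) >= tol:
    ...         return True
    ...     else:
    ...         return False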
@@ -238,11 +238,11 @@ database, and processed with an NLP library, to one of three levels of detail
--- tokenization, tagging, or parsing. The tasks are additive: to parse the
text you have to tokenize and tag it. The pre-processing was not subtracted
from the times --- I report the time required for the pipeline to complete.
I report mean times per document, in milliseconds.

**Hardware**: Intel i7-3770 (2012)

.. table:: Efficiency comparison. Lower is better.

    +--------------+---------------------------+--------------------------------+
    |              | Absolute (ms per doc)     | Relative (to spaCy)            |

@@ -287,7 +287,7 @@ representations.
.. spaCy is based on science, not alchemy. It's open source, and I am happy to
   clarify any detail of the algorithms I've implemented.
   It's evaluated against the current best published systems, following the standard
   methodologies. These evaluations show that it performs extremely well.

Accuracy Comparison
-------------------

@@ -299,7 +299,7 @@ Accuracy Comparison
    +--------------+----------+------------+
    | spaCy        | 97.2     | 92.4       |
    +--------------+----------+------------+
    | CoreNLP      | 96.9     | 92.2       |
    +--------------+----------+------------+
    | ZPar         | 97.3     | 92.9       |
    +--------------+----------+------------+

@@ -329,5 +329,5 @@ previous fastest parser that I'm aware of.
   quickstart.rst
   api.rst
   howworks.rst
   license.rst
   updates.rst

@@ -97,7 +97,7 @@ like lead-text take a while to float up the priority list. This strategy also has
the advantage of transparency: it's obvious to users how the decision is being
made, so nobody is likely to complain about the feature if it works this way.

Instead of cutting off the text mid-word, we can tokenize the text, and

    +----------------+-----------+
    | System         | Rouge-1 R |

@@ -116,7 +116,7 @@ A simple bag-of-words model can be created using the `count_by` method, which
produces a dictionary of frequencies, keyed by string IDs:

.. code:: python

    >>> from spacy.en import English
    >>> from spacy.en.attrs import SIC
    >>> nlp = English()
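The code block above stops after the imports. A hedged sketch of how the `count_by` call described in the prose might continue (the example text and variable names are mine; `SIC` is the attribute imported above):

.. code:: python

    >>> tokens = nlp(u'An example sentence. Another example sentence.')
    >>> counts = tokens.count_by(SIC)                     # dict: string ID -> frequency
    >>> nlp.vocab.strings[max(counts, key=counts.get)]    # most frequent word form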
@@ -148,7 +148,7 @@ from any token:





.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/

@@ -196,8 +196,8 @@ undirected --- so, it's natural to represent this as a matrix:

    from scipy.spatial.distance import cosine
    import numpy


    def lexrank(sent_vectors):
        n = len(sent_vectors)
        # Build the cosine similarity matrix

@@ -205,7 +205,7 @@ undirected --- so, it's natural to represent this as a matrix:
        for i in range(n):
            for j in range(n):
                matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j])
        # Normalize
        for i in range(n):
            matrix[i] /= sum(matrix[i])
        return _pagerank(matrix)
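`_pagerank` is called above but not shown in this diff. A minimal power-iteration sketch of what such a helper could look like (hypothetical; the actual implementation is not part of this commit):

.. code:: python

    import numpy

    def _pagerank(matrix, damping=0.85, n_iter=50):
        # Power iteration over the row-normalised similarity matrix.
        n = matrix.shape[0]
        ranks = numpy.ones(n) / n
        for _ in range(n_iter):
            ranks = (1 - damping) / n + damping * matrix.T.dot(ranks)
        return ranks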
@@ -278,6 +278,3 @@ sentence represents the document as a whole.

Document Model
--------------

@@ -13,7 +13,7 @@ I've been writing spaCy for six months now, and I'm very excited to release it.
I think it's the most valuable thing I could have built. When I was in
academia, I noticed that small companies couldn't really make use of our work.
Meanwhile the tech giants have been hiring *everyone*, and putting this stuff
into production. I think spaCy can change that.


+------------+-----------+----------+-------------------------------------+

@@ -52,14 +52,14 @@ Examples
--------

In order to clarify how spaCy's license structure might apply to you, I've
written a few examples, in the form of user-stories.

Ashley and Casey: Seed stage start-up
#####################################

Ashley and Casey have an idea for a start-up. To explore their idea, they want
to build a minimum viable product they can put in front of potential users and
investors.

They have two options.

@@ -75,7 +75,7 @@ They have two options.
   import a module that imports it, etc). They also cannot use spaCy as
   a network resource, by running it as a service --- this is the
   loophole that the "A" part of the AGPL is designed to close.

Ashley and Casey find the AGPL license unattractive for commercial use.
They decide to take up the trial commercial license.
However, over the next 90 days, Ashley has to move house twice, and Casey gets

@@ -18,7 +18,7 @@ With Python 2.7 or Python 3, using Linux or OSX, run:
.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz


The download command fetches and installs about 300mb of data, for the
parser model and word vectors, which it installs within the spacy.en package directory.

If you're stuck using a server with an old version of Python, and you don't

@@ -88,7 +88,7 @@ the original orthographic form of the word.

.. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data'))

  .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens

    +-----------------+--------------+--------------+
    | Attribute       | Type         | Its API      |

@@ -121,7 +121,7 @@ the original orthographic form of the word.
**Get sentence or named entity spans**

.. py:attribute:: tokens.Tokens.sents --> Iterator[Span]

.. py:attribute:: tokens.Tokens.ents --> Iterator[Span]

You can iterate over a Span to access individual Tokens, or access its
|
||||||
**Embedded word representenations**
|
**Embedded word representenations**
|
||||||
|
|
||||||
.. py:attribute:: tokens.Token.repvec
|
.. py:attribute:: tokens.Token.repvec
|
||||||
|
|
||||||
.. py:attribute:: lexeme.Lexeme.repvec
|
.. py:attribute:: lexeme.Lexeme.repvec
|
||||||
|
|
||||||
|
|
||||||
|
@@ -150,13 +150,13 @@ the original orthographic form of the word.
**Align to original string**

.. py:attribute:: string: unicode

    Padded with original whitespace.

.. py:attribute:: length: int

    Length, in unicode code-points. Equal to len(self.orth_).

.. py:attribute:: idx: int

    Starting offset of word in the original string.

@@ -234,4 +234,3 @@ Features
+---------+-----------------------------------------------------------+
| prob    | Log probability of word, smoothed with Simple Good-Turing |
+---------+-----------------------------------------------------------+

@@ -21,7 +21,7 @@ Bug Fixes
all look-ups into the vocabulary failed on wide unicode builds, which
further meant that the part-of-speech tagger and parser features were not
computed correctly.

The fix is simple: we already have to read in a list of all the strings, so
just store an index into that list, instead of a hash.

@@ -36,7 +36,7 @@ Bug Fixes
and we want to freely navigate up and down it without creating reference
cycles that inhibit garbage collection, and without doing a lot of copying,
creating and deleting.

I think I've got a promising solution to this, but I suspect there's
currently a memory leak. Please get in touch on the tracker if you want to
know more, especially if you think you can help.

@@ -60,7 +60,7 @@ Most English parsing research is performed on text with perfect pre-processing:
one newline between every sentence, one space between every token.
It's always been done this way, and it's good. It's a useful idealisation,
because the pre-processing has few algorithmic implications.

But, for practical performance, this stuff can matter a lot.
Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few
parsers on raw text. Even on the standard Wall Street Journal corpus,

@@ -1,7 +1,7 @@
{
    "PRP": {
        "I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1},
        "me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
        "mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
        "myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4},
        "you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0},

setup.py

@@ -150,7 +150,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.morphology',
             'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
             'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager', 'spacy.syntax._parse_features',
             'spacy.syntax.conll', 'spacy.orth',
             'spacy.syntax.ner']

@@ -33,7 +33,7 @@ cdef class Model:
cdef class HastyModel:
    cdef Pool mem
    cdef weight_t* _scores

    cdef const weight_t* score(self, atom_t* context) except NULL
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1

@@ -79,5 +79,3 @@ cpdef enum attr_id_t:
    POS
    TAG
    DEP
@@ -129,19 +129,19 @@ class English(object):
                 entity=parse_if_model_present, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string

        The tagger and parser are lazy-loaded the first time they are required.
        Loading the parser model usually takes 5-10 seconds.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text. Also
                sets morphological analysis and lemmas.

            parse (True, False, -1): Whether to add labelled syntactic dependencies.

                -1 (default) is "guess": It will guess True if tag=True and the
                model has been installed.
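A short usage sketch for the call signature documented above, passing the documented keyword arguments explicitly (the example text is mine):

.. code:: python

    >>> from spacy.en import English
    >>> nlp = English()
    >>> tokens = nlp(u'A sample sentence.', tag=True, parse=False)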
@@ -39,7 +39,7 @@ def install_parser_model(url, dest_dir):
def install_dep_vectors(url, dest_dir):
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)

    filename = download_file(url, dest_dir)

@@ -22,4 +22,3 @@ cdef class EnPosTagger:

    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

@@ -353,7 +353,7 @@ cdef class EnPosTagger:
            cached.lemma = self.strings[lemma_str]
            set_morph_from_dict(&cached.morph, props)
            self._morph_cache.set(pos, orth, <void*>cached)


cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    _fill_from_token(&context[P2_orth], &tokens[i-2])

@@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
        context[7] = 4
    else:
        context[7] = 0

@@ -12,7 +12,7 @@ cdef LexemeC EMPTY_LEXEME

cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                              const float* empty_vec) except -1

cdef class Lexeme:
    cdef readonly ndarray repvec

@@ -17,12 +17,12 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
                              const float* empty_vec) except -1:
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]

    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']

@@ -58,10 +58,10 @@ LOCAL = (
    (N3.sic,),
    (P4.sic,),
    (N4.sic,),

    (P1.sic, N0.sic,),
    (N0.sic, N1.sic),

    (N0.prefix,),
    (N0.suffix,),

@@ -11,7 +11,7 @@ cdef class NERAnnotation:
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        cdef int start, end, label
        for start, end, label in entities:
            for i in range(start, end):

@@ -107,7 +107,7 @@ cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
        # U, Gold L --> False
        # U, Gold O --> False
        return False


cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
    cdef int n_accept = 0

@@ -160,7 +160,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    cdef int best = first_accept
    cdef weight_t score = scores[first_accept-1]
    cdef int i
    for i in range(first_accept+1, n):
        if moves[i].accept and scores[i-1] > score:
            best = i
            score = scores[i-1]

@@ -179,7 +179,7 @@ cdef int transition(State *s, Move* move) except -1:
        end_entity(s)
    elif move.action == OUT:
        pass
    s.tags[s.i] = move.clas
    s.i += 1
@@ -149,5 +149,3 @@ cpdef enum:


cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1

@@ -18,7 +18,7 @@ cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
    c[T_postype] = lex.postype
    c[T_nertype] = 0
    c[T_sensetype] = 0

    c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
    c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
    c[T_is_lower] = lex.flags & (1 << IS_LOWER)

@@ -7,10 +7,10 @@ LOCAL = (
    (N1_sic,),
    (P2_sic,),
    (N2_sic,),

    (P1_sic, W_sic,),
    (W_sic, N1_sic),

    (W_prefix,),
    (W_suffix,),

@@ -92,7 +92,7 @@ cdef class NERParser:
            fill_context(self._context, s, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)

            set_accept_if_valid(self._moves, self.n_classes, s)
            guess = best_accepted(self._moves, self._scores, self.n_classes)
            assert guess.clas != 0

@@ -16,7 +16,7 @@ cpdef enum ActionType:

cdef int set_accept_if_oracle(Move* moves, int n, State* s,
                              int* g_starts, int* g_ends, int* g_labels) except 0

cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0

cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL

@@ -97,7 +97,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
    cdef int best = first_accept
    cdef weight_t score = scores[first_accept-1]
    cdef int i
    for i in range(first_accept+1, n):
        if moves[i].accept and scores[i-1] > score:
            best = i
            score = scores[i-1]

@@ -105,7 +105,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:


cdef int transition(State *s, Move* move) except -1:
    s.tags[s.i] = move.clas
    if move.action == OUT:
        s.i += 1
    elif move.action == SHIFT:

@@ -8,7 +8,7 @@ cdef class PyState:
    cdef readonly list tag_names
    cdef readonly int n_classes
    cdef readonly dict moves_by_name

    cdef Move* _moves
    cdef Move* _golds
    cdef State* _s

@@ -33,7 +33,7 @@ class Scorer(object):
    @property
    def ents_r(self):
        return (self.ents_tp / (self.ents_tp + self.ents_fn + 1e-100)) * 100

    @property
    def ents_f(self):
        return (2 * self.ents_p * self.ents_r) / (self.ents_p + self.ents_r + 1e-100)
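For reference, the recall and F-score properties above follow the usual precision/recall/F definitions; a plain restatement (the 1e-100 term only guards against division by zero):

.. code:: python

    def precision(tp, fp):
        return (tp / (tp + fp + 1e-100)) * 100

    def recall(tp, fn):
        return (tp / (tp + fn + 1e-100)) * 100

    def f_score(p, r):
        return (2 * p * r) / (p + r + 1e-100)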
@@ -5,7 +5,7 @@ from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore


cdef class Span:
    cdef readonly Tokens _seq
    cdef public int i

@@ -15,4 +15,3 @@ cdef class Span:
    cdef public Span head
    cdef public list rights
    cdef public list lefts

@@ -8,7 +8,7 @@ cdef struct LexemeC:
    const float* repvec

    flags_t flags

    attr_t id
    attr_t length

@@ -18,7 +18,7 @@ cdef struct LexemeC:
    attr_t shape
    attr_t prefix
    attr_t suffix

    attr_t cluster

    float prob

@@ -99,7 +99,7 @@ cpdef enum:
    S0_shape
    S0_ne_iob
    S0_ne_type

    S0r2w
    S0r2W
    S0r2p

@@ -164,7 +164,7 @@ cpdef enum:
    N0_shape
    N0_ne_iob
    N0_ne_type

    N1w
    N1W
    N1p

@@ -190,7 +190,7 @@ cpdef enum:
    N2_shape
    N2_ne_iob
    N2_ne_type

    P1w
    P1W
    P1p

@@ -203,7 +203,7 @@ cpdef enum:
    P1_shape
    P1_ne_iob
    P1_ne_type

    P2w
    P2W
    P2p

@@ -216,7 +216,7 @@ cpdef enum:
    P2_shape
    P2_ne_iob
    P2_ne_type

    E0w
    E0W
    E0p

@@ -229,7 +229,7 @@ cpdef enum:
    E0_shape
    E0_ne_iob
    E0_ne_type

    E1w
    E1W
    E1p

@@ -242,7 +242,7 @@ cpdef enum:
    E1_shape
    E1_ne_iob
    E1_ne_type

    # Misc features at the end
    dist
    N0lv

@@ -111,10 +111,10 @@ ner = (
    (N1W,),
    (P2W,),
    (N2W,),

    (P1W, N0W,),
    (N0W, N1W),

    (N0_prefix,),
    (N0_suffix,),

@@ -205,22 +205,22 @@ ner = (
unigrams = (
    (S2W, S2p),
    (S2c6, S2p),

    (S1W, S1p),
    (S1c6, S1p),

    (S0W, S0p),
    (S0c6, S0p),

    (N0W, N0p),
    (N0p,),
    (N0c,),
    (N0c6, N0p),
    (N0L,),

    (N1W, N1p),
    (N1c6, N1p),

    (N2W, N2p),
    (N2c6, N2p),

@@ -316,7 +316,7 @@ trigrams = (
    (S0p, S0lp, N0p),
    (S0p, N0p, N0lp),
    (N0p, N0lp, N0l2p),

    (S0W, S0p, S0rL, S0r2L),
    (S0p, S0rL, S0r2L),
@@ -27,7 +27,7 @@ cdef int pop_stack(State *s) except -1:
    s.stack -= 1
    if s.stack_len == 0 and not at_eol(s):
        push_stack(s)


cdef int push_stack(State *s) except -1:
    assert s.i < s.sent_len

@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t


from ._state cimport State
from .transition_system cimport TransitionSystem, Transition

@@ -280,5 +280,3 @@ class OracleError(Exception):

class UnknownMove(Exception):
    pass

@@ -36,7 +36,7 @@ from . import _parse_features
from ._parse_features cimport fill_context, CONTEXT_SIZE


DEBUG = False
def set_debug(val):
    global DEBUG
    DEBUG = val

@@ -112,7 +112,7 @@ cdef class GreedyParser:
            scores = self.model.score(context)
            guess = self.moves.best_valid(scores, state)
            best = self.moves.best_gold(scores, state, gold)

            cost = guess.get_cost(&guess, state, gold)
            self.model.update(context, guess.clas, best.clas, cost)

@@ -33,16 +33,16 @@ cdef class TransitionSystem:
    cdef int first_state(self, State* state) except -1

    cdef int preprocess_gold(self, GoldParse gold) except -1

    cdef Transition lookup_transition(self, object name) except *

    cdef Transition init_transition(self, int clas, int move, int label) except *

    cdef Transition best_valid(self, const weight_t* scores, const State* state) except *

    cdef Transition best_gold(self, const weight_t* scores, const State* state,
                              GoldParse gold) except *


#cdef class PyState:
#    """Provide a Python class for testing purposes."""

@@ -13,5 +13,3 @@ class Config(object):
    @classmethod
    def read(cls, model_dir, name):
        return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))

@@ -60,7 +60,7 @@ cdef class Tokenizer:
        split off a suffix, and repeat.

        Args:
            string (unicode): The string to be tokenized.

        Returns:
            tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.

@@ -213,7 +213,7 @@ cdef class Tokenizer:
        cdef unicode string = chars[:length]
        match = self._infix_re.search(string)
        return match.start() if match is not None else 0

    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
        cdef unicode string = chars[:length]
        match = self._prefix_re.search(string)
@ -31,9 +31,9 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef Vocab vocab
|
cdef Vocab vocab
|
||||||
|
|
||||||
cdef TokenC* data
|
cdef TokenC* data
|
||||||
|
|
||||||
|
|
||||||
cdef list _py_tokens
|
cdef list _py_tokens
|
||||||
cdef unicode _string
|
cdef unicode _string
|
||||||
|
@ -61,7 +61,7 @@ cdef class Token:
|
||||||
cdef int array_len
|
cdef int array_len
|
||||||
cdef bint _owns_c_data
|
cdef bint _owns_c_data
|
||||||
|
|
||||||
|
|
||||||
cdef Tokens _seq
|
cdef Tokens _seq
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
@ -104,10 +104,10 @@ cdef class Tokens:
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Retrieve a token.
|
"""Retrieve a token.
|
||||||
|
|
||||||
The Python Token objects are created lazily from internal C data, and
|
The Python Token objects are created lazily from internal C data, and
|
||||||
cached in _py_tokens
|
cached in _py_tokens
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
token (Token):
|
token (Token):
|
||||||
"""
|
"""
|
||||||
|
@ -180,7 +180,7 @@ cdef class Tokens:
|
||||||
yield Span(self, start, i+1)
|
yield Span(self, start, i+1)
|
||||||
start = None
|
start = None
|
||||||
if start is not None:
|
if start is not None:
|
||||||
yield Span(self, start, self.length)
|
yield Span(self, start, self.length)
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
|
@ -298,7 +298,7 @@ cdef class Tokens:
|
||||||
# What to do about morphology??
|
# What to do about morphology??
|
||||||
# TODO: token.morph = ???
|
# TODO: token.morph = ???
|
||||||
token.tag = self.vocab.strings[tag]
|
token.tag = self.vocab.strings[tag]
|
||||||
token.lemma = self.vocab.strings[lemma]
|
token.lemma = self.vocab.strings[lemma]
|
||||||
if ent_type == 'O':
|
if ent_type == 'O':
|
||||||
token.ent_iob = 2
|
token.ent_iob = 2
|
||||||
token.ent_type = 0
|
token.ent_type = 0
|
||||||
|
@ -355,7 +355,7 @@ cdef class Tokens:
|
||||||
self._py_tokens = [None] * self.length
|
self._py_tokens = [None] * self.length
|
||||||
# Return the merged Python object
|
# Return the merged Python object
|
||||||
return self[start]
|
return self[start]
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
|
@ -608,4 +608,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
|
||||||
Check that the parser data is installed. Run "python -m spacy.en.download" if not.
|
Check that the parser data is installed. Run "python -m spacy.en.download" if not.
|
||||||
Check whether parse=False in the call to English.__call__
|
Check whether parse=False in the call to English.__call__
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
|
@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
|
||||||
ctypedef uint32_t id_t
|
ctypedef uint32_t id_t
|
||||||
ctypedef uint16_t len_t
|
ctypedef uint16_t len_t
|
||||||
ctypedef uint16_t tag_t
|
ctypedef uint16_t tag_t
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -71,7 +71,7 @@ def read_detoken_rules(lang):
|
||||||
for line in file_:
|
for line in file_:
|
||||||
entries.append(line.strip())
|
entries.append(line.strip())
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def align_tokens(ref, indices):
|
def align_tokens(ref, indices):
|
||||||
start = 0
|
start = 0
|
||||||
|
@@ -87,7 +87,7 @@ def align_tokens(ref, indices):


def detokenize(token_rules, words):
    """To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.
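The detokenize docstring describes grouping token indices into whitespace-delimited "chunks", each chunk being a tuple of token indices. A minimal sketch of that grouping idea, assuming we already know for each token whether it attaches to the previous one without intervening whitespace (the chunk_indices helper is hypothetical, not the project's detokenize):

    def chunk_indices(glued_to_prev):
        """Group token indices into whitespace-delimited chunks.

        glued_to_prev[i] is True when token i is not preceded by whitespace,
        so for tokens like ["did", "n't", "work"] it is [False, True, False]
        and the chunks are [(0, 1), (2,)].
        """
        if not glued_to_prev:
            return []
        chunks, current = [], [0]
        for i in range(1, len(glued_to_prev)):
            if glued_to_prev[i]:
                current.append(i)
            else:
                chunks.append(tuple(current))
                current = [i]
        chunks.append(tuple(current))
        return chunks

    print(chunk_indices([False, True, False]))   # [(0, 1), (2,)]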
@@ -31,6 +31,5 @@ cdef class Vocab:
    cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

    cdef PreshMap _map
@@ -170,7 +170,7 @@ cdef class Vocab:
            self.lexemes[lexeme.id] = lexeme
            i += 1
        fclose(fp)

    def load_rep_vectors(self, loc):
        file_ = _CFile(loc, b'rb')
        cdef int32_t word_len
@@ -187,7 +187,7 @@ cdef class Vocab:
            except IOError:
                break
            file_.read(&vec_len, sizeof(vec_len), 1)

            mem = Address(word_len, sizeof(char))
            chars = <char*>mem.ptr
            vec = <float*>self.mem.alloc(vec_len, sizeof(float))
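load_rep_vectors pulls a word length, a vector length, the word's bytes, and the float values out of a binary vectors file through the _CFile wrapper. A pure-Python sketch of reading such a record stream with struct; the field order and byte widths here are inferred from the calls visible in the hunks, not from documented behaviour:

    import struct

    def read_rep_vectors(loc):
        """Sketch: records assumed to be (int32 word_len, int32 vec_len,
        word_len bytes of UTF-8, vec_len float32 values)."""
        vectors = {}
        with open(loc, 'rb') as file_:
            while True:
                header = file_.read(8)
                if len(header) < 8:                  # end of file
                    break
                word_len, vec_len = struct.unpack('=ii', header)
                word = file_.read(word_len).decode('utf8')
                vec = struct.unpack('=%df' % vec_len, file_.read(4 * vec_len))
                vectors[word] = vec
        return vectors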
@@ -1,4 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
@@ -30,6 +30,3 @@ def test_align_continue():
    assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
    assert aligned[3] == ('and', [(13, 16)])
    assert aligned[4] == ('continue', [(16, 24)])
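These assertions expect each reference token to be paired with the (start, end) character spans of the sub-tokens that cover it. A hedged reimplementation sketch of that alignment, assuming contiguous spans with no whitespace gaps; it is named align_spans to make clear it is not the project's align_tokens:

    def align_spans(ref, indices):
        """Pair each reference token with the spans that cover its characters."""
        spans = iter(indices)
        offset = indices[0][0] if indices else 0
        for token in ref:
            end = offset + len(token)                # where this token should stop
            token_spans = []
            for start, stop in spans:
                token_spans.append((start, stop))
                if stop >= end:
                    break
            yield token, token_spans
            offset = end

    tokens = ['re-align', 'and', 'continue']
    spans = [(5, 7), (7, 8), (8, 13), (13, 16), (16, 24)]
    print(list(align_spans(tokens, spans)))
    # [('re-align', [(5, 7), (7, 8), (8, 13)]), ('and', [(13, 16)]), ('continue', [(16, 24)])]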
@@ -37,5 +37,3 @@ def test_dep():
    assert feats_array[1][1] == tokens[1].dep
    assert feats_array[2][1] == tokens[2].dep
    assert feats_array[3][1] == tokens[3].dep
@@ -57,7 +57,7 @@ def test3():
    assert sum(o) != 0
    from numpy import dot
    from numpy.linalg import norm

    cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
    words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec]
    words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
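The test ranks vocabulary entries by cosine similarity to a word's repvec: the dot product of two vectors divided by the product of their norms. A tiny self-contained example with made-up values:

    import numpy as np

    def cosine(v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    a = np.array([1.0, 0.0, 1.0])
    b = np.array([0.5, 0.2, 0.9])
    print(cosine(a, b))   # ~0.94: the vectors point in nearly the same direction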
@@ -35,4 +35,3 @@ def test_merge_heads():

def test_issue_54():
    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
    tokens = NLU(text, merge_mwes=True)
@@ -33,4 +33,3 @@ def test_word():

def test_not_number():
    assert not like_number('dog')
    assert not like_number(',')
@@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
        assert not children
    for head_index, children in rights.items():
        assert not children
@@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):

def test_double_end_quote(EN):
    assert len(EN("Hello''")) == 2
    assert len(EN("''")) == 1
@@ -16,6 +16,3 @@ def test_one(EN):
    assert tokens[0].orth_ == 'Betty'
    tokens2 = EN('Betty also bought a pound of butter.')
    assert tokens2[0].orth_ == 'Betty'
@@ -16,4 +16,3 @@ def test_subtrees():
    assert len(list(bus.children)) == 1

    assert len(list(wheels.subtree)) == 6
@@ -35,5 +35,3 @@ def test_single_token_string():
    nlp = English()
    tokens = nlp(u'foobar')
    assert tokens[0].string == 'foobar'
@@ -63,15 +63,15 @@ def test_contraction_punct(EN):
def test_sample(EN):
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""

    tokens = EN(text)
    assert len(tokens) > 5
@@ -39,5 +39,3 @@ def test_newline_double_space(EN):

def test_newline_space_wrap(EN):
    tokens = EN('hello \n possums')
    assert len(tokens) == 3
@@ -20,7 +20,7 @@ s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g

# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
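The comments in this sed fragment state the policy: split only sentence-final periods (sentence tokenization is assumed to have happened already), but split every question mark and exclamation point, since those carry no abbreviation-marker ambiguity. A rough Python equivalent of just those two rules, for illustration only:

    import re

    CLOSERS = r'[\])}>"\']*'   # closing brackets/quotes that may follow a final period

    def split_terminal_punct(sentence):
        # Final period (plus any trailing closers) only, as in the sed rule above.
        sentence = re.sub(r'([^.])([.])(' + CLOSERS + r')\s*$', r'\1 \2\3 ', sentence)
        # Question and exclamation marks are unambiguous, so split them everywhere.
        sentence = re.sub(r'([?!])', r' \1 ', sentence)
        return sentence

    print(split_terminal_punct('The end.'))      # 'The end . '
    print(split_terminal_punct('Is this it?'))   # 'Is this it ? '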