Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2019-03-12 13:33:22 +01:00
commit 1179de0860
5 changed files with 175 additions and 20 deletions

bin/train_word_vectors.py (new file)

@@ -0,0 +1,107 @@
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text


def iter_dir(loc):
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)


if __name__ == "__main__":
    plac.call(main)
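For reference, a sketch of how the saved model could be loaded back and queried with Gensim. The invocation, output path, and query word are placeholders, not part of the commit:

```python
# Minimal usage sketch, assuming the script was run as:
#     python bin/train_word_vectors.py en /path/to/texts /tmp/word_vectors -d 300 -i 5
from gensim.models import Word2Vec

model = Word2Vec.load("/tmp/word_vectors")  # the out_loc passed to main()
for word, similarity in model.wv.most_similar("coffee", topn=5):
    print(word, similarity)
```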

@@ -8,16 +8,13 @@ from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
-from ...tokens import Doc, Token
+from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer

 ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

-# TODO: Is this the right place for this?
-Token.set_extension("mecab_tag", default=None)

 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -82,10 +79,12 @@ class JapaneseTokenizer(DummyTokenizer):
         words = [x.surface for x in dtokens]
         spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
+        mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
-            token._.mecab_tag = dtoken.pos
+            mecab_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
             token.lemma_ = dtoken.lemma
+        doc.user_data["mecab_tags"] = mecab_tags
         return doc
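After this change, the raw MeCab part-of-speech tags travel with each `Doc` via `user_data` instead of a `Token` extension registered at import time. A minimal usage sketch (assumes the MeCab dependency is installed; the sample text is arbitrary):

```python
import spacy

nlp = spacy.blank("ja")  # builds the MeCab-backed JapaneseTokenizer
doc = nlp("すもももももももものうち")
# One raw MeCab tag per token, stored parallel to the Doc's tokens:
for token, mecab_tag in zip(doc, doc.user_data["mecab_tags"]):
    print(token.text, token.tag_, mecab_tag)
```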

@@ -39,9 +39,9 @@ together all components and creating the `Language` subclass – for example,
 | **Morph rules**<br />[`morph_rules.py`][morph_rules.py] | Exception rules for morphological analysis of irregular words like personal pronouns. |

 [stop_words.py]:
-  https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/stop_words.py
+  https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py
 [tokenizer_exceptions.py]:
-  https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/tokenizer_exceptions.py
+  https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py
 [norm_exceptions.py]:
   https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py
 [punctuation.py]:
@@ -49,12 +49,12 @@ together all components and creating the `Language` subclass – for example,
 [char_classes.py]:
   https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py
 [lex_attrs.py]:
-  https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/lex_attrs.py
+  https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py
 [syntax_iterators.py]:
   https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py
 [lemmatizer.py]:
-  https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/lemmatizer.py
+  https://github.com/explosion/spaCy/tree/master/spacy/lang/de/lemmatizer.py
 [tag_map.py]:
-  https://github.com/explosion/spacy-dev-resources/tree/master/templates/new_language/tag_map.py
+  https://github.com/explosion/spaCy/tree/master/spacy/lang/en/tag_map.py
 [morph_rules.py]:
   https://github.com/explosion/spaCy/tree/master/spacy/lang/en/morph_rules.py

@@ -105,11 +105,11 @@ to know the language's character set. If the language you're adding uses
 non-latin characters, you might need to define the required character classes in
 the global
 [`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py).
-For efficiency, spaCy uses hard-coded unicode ranges to define character classes,
-the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block).
-If the language requires very specific punctuation
-rules, you should consider overwriting the default regular expressions with your
-own in the language's `Defaults`.
+For efficiency, spaCy uses hard-coded unicode ranges to define character
+classes, the definitions of which can be found on
+[Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). If the language
+requires very specific punctuation rules, you should consider overwriting the
+default regular expressions with your own in the language's `Defaults`.

 </Infobox>
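A sketch of that kind of override at runtime, based on the pattern spaCy's docs use for customizing suffix rules (the added pattern is invented for illustration; a new language would instead define its tuples in `punctuation.py` and attach them to its `Defaults`):

```python
from spacy.lang.en import English
from spacy.util import compile_suffix_regex

nlp = English()
# Extend the default suffix patterns, then rebuild the compiled regex
# the tokenizer uses to split suffixes off tokens.
suffixes = nlp.Defaults.suffixes + (r"-~",)
suffix_regex = compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search
```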
@@ -121,9 +121,9 @@ spaCy, named according to the language's
 code and resources specific to Spanish are placed into a directory
 `spacy/lang/es`, which can be imported as `spacy.lang.es`.

-To get started, you can use our
-[templates](https://github.com/explosion/spacy-dev-resources/templates/new_language)
-for the most important files. Here's what the class template looks like:
+To get started, you can check out the
+[existing languages](https://github.com/explosion/spacy/tree/master/spacy/lang).
+Here's what the class could look like:

 ```python
 ### __init__.py (excerpt)
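For orientation, a minimal sketch of what such an `__init__.py` defines, following the conventions of the existing `spacy/lang` packages (the `Xxxxx` names and the `"xx"` code are placeholders, not spaCy's actual template):

```python
from .stop_words import STOP_WORDS
from ...attrs import LANG
from ...language import Language


class XxxxxDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "xx"  # the language's ISO code
    stop_words = STOP_WORDS


class Xxxxx(Language):
    lang = "xx"
    Defaults = XxxxxDefaults


__all__ = ["Xxxxx"]
```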
@@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
 useful by themselves – they power the `.similarity` methods in spaCy. For best
 results, you should pre-process the text with spaCy before training the Word2vec
 model. This ensures your tokenization will match. You can use our
-[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
+[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
 which pre-processes the text with your language-specific tokenizer and trains
 the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
 file should consist of one word and vector per line.

 ```python
-https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
+https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
 ```
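Because the output is one word and vector per line, loading it into a pipeline is a short loop; a minimal sketch (the path, language, and dtype are assumptions):

```python
import numpy
import spacy

nlp = spacy.blank("en")
with open("vectors.bin", encoding="utf-8") as file_:
    for line in file_:
        pieces = line.rstrip().split(" ")
        word, vector = pieces[0], numpy.asarray(pieces[1:], dtype="f")
        nlp.vocab.set_vector(word, vector)  # register the word's vector
```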
If you don't have a large sample of text available, you can also convert word

@@ -524,6 +524,22 @@
         },
         "category": ["standalone", "research"]
     },
+    {
+        "id": "scispacy",
+        "title": "scispaCy",
+        "slogan": "A full spaCy pipeline and models for scientific/biomedical documents",
+        "github": "allenai/scispacy",
+        "pip": "scispacy",
+        "thumb": "https://i.imgur.com/dJQSclW.png",
+        "url": "https://allenai.github.io/scispacy/",
+        "author": "Allen Institute for Artificial Intelligence",
+        "author_links": {
+            "github": "allenai",
+            "twitter": "allenai_org",
+            "website": "http://allenai.org"
+        },
+        "category": ["models", "research"]
+    },
     {
         "id": "textacy",
         "slogan": "NLP, before and after spaCy",
@@ -851,6 +867,22 @@
         },
         "category": ["courses"]
     },
+    {
+        "type": "education",
+        "id": "datacamp-advanced-nlp",
+        "title": "Advanced Natural Language Processing with spaCy",
+        "slogan": "Datacamp, 2019",
+        "description": "If you're working with a lot of text, you'll eventually want to know more about it. For example, what's it about? What do the words mean in context? Who is doing what to whom? What companies and products are mentioned? Which texts are similar to each other? In this course, you'll learn how to use spaCy, a fast-growing industry standard library for NLP in Python, to build advanced natural language understanding systems, using both rule-based and machine learning approaches.",
+        "url": "https://www.datacamp.com/courses/advanced-nlp-with-spacy",
+        "thumb": "https://i.imgur.com/0Zks7c0.jpg",
+        "author": "Ines Montani",
+        "author_links": {
+            "twitter": "_inesmontani",
+            "github": "ines",
+            "website": "https://ines.io"
+        },
+        "category": ["courses"]
+    },
     {
         "type": "education",
         "id": "learning-path-spacy",
@@ -910,6 +942,7 @@
         "description": "Most NLP projects rely crucially on the quality of annotations used for training and evaluating models. In this episode, Matt and Ines of Explosion AI tell us how Prodigy can improve data annotation and model development workflows. Prodigy is an annotation tool implemented as a python library, and it comes with a web application and a command line interface. A developer can define input data streams and design simple annotation interfaces. Prodigy can help break down complex annotation decisions into a series of binary decisions, and it provides easy integration with spaCy models. Developers can specify how models should be modified as new annotations come in in an active learning framework.",
         "soundcloud": "559200912",
         "thumb": "https://i.imgur.com/hOBQEzc.jpg",
+        "url": "https://soundcloud.com/nlp-highlights/78-where-do-corpora-come-from-with-matt-honnibal-and-ines-montani",
         "author": "Matt Gardner, Waleed Ammar (Allen AI)",
         "author_links": {
             "website": "https://soundcloud.com/nlp-highlights"
@@ -925,12 +958,28 @@
         "iframe": "https://www.pythonpodcast.com/wp-content/plugins/podlove-podcasting-plugin-for-wordpress/lib/modules/podlove_web_player/player_v4/dist/share.html?episode=https://www.pythonpodcast.com/?podlove_player4=176",
         "iframe_height": 200,
         "thumb": "https://i.imgur.com/rpo6BuY.png",
+        "url": "https://www.podcastinit.com/episode-87-spacy-with-matthew-honnibal/",
         "author": "Tobias Macey",
         "author_links": {
             "website": "https://www.podcastinit.com"
         },
         "category": ["podcasts"]
     },
+    {
+        "type": "education",
+        "id": "talk-python-podcast",
+        "title": "Talk Python 202: Building a software business",
+        "slogan": "March 2019",
+        "description": "One core question around open source is how do you fund it? Well, there is always that PayPal donate button. But that's been a tremendous failure for many projects. Often the go-to answer is consulting. But what if you don't want to trade time for money? You could take things up a notch and change the equation, exchanging value for money. That's what Ines Montani and her co-founder did when they started Explosion AI with spaCy as the foundation.",
+        "thumb": "https://i.imgur.com/q1twuK8.png",
+        "url": "https://talkpython.fm/episodes/show/202/building-a-software-business",
+        "soundcloud": "588364857",
+        "author": "Michael Kennedy",
+        "author_links": {
+            "website": "https://talkpython.fm/"
+        },
+        "category": ["podcasts"]
+    },
     {
         "id": "adam_qas",
         "title": "ADAM: Question Answering System",