Merge branch 'develop' into spacy.io

This commit is contained in:
Ines Montani 2019-03-06 14:41:25 +01:00
commit 0c09831227
2 changed files with 16 additions and 22 deletions

View File

@@ -4,6 +4,7 @@ from __future__ import unicode_literals, print_function
 import re
 from collections import namedtuple
 
+from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
@@ -38,24 +39,20 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
     if token.pos == "連体詞,*,*,*":
         if re.match(r"[こそあど此其彼]の", token.surface):
             return token.pos + ",DET"
         if re.match(r"[こそあど此其彼]", token.surface):
             return token.pos + ",PRON"
         return token.pos + ",ADJ"
     return token.pos
 
 
 def detailed_tokens(tokenizer, text):
     """Format Mecab output into a nice data structure, based on Janome."""
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
@@ -64,12 +61,10 @@ def detailed_tokens(tokenizer, text):
         base = surface  # a default value. Updated if available later.
         parts = node.feature.split(",")
         pos = ",".join(parts[0:4])
         if len(parts) > 7:
             # this information is only available for words in the tokenizer
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
         node = node.next
     return words
@@ -78,29 +73,25 @@ def detailed_tokens(tokenizer, text):
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         self.tokenizer = try_mecab_import().Tagger()
         self.tokenizer.parseToNode("")  # see #2901
 
     def __call__(self, text):
         dtokens = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
         spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         for token, dtoken in zip(doc, dtokens):
             token._.mecab_tag = dtoken.pos
             token.tag_ = resolve_pos(dtoken)
             token.lemma_ = dtoken.lemma
         return doc
 
 
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda _text: "ja"
+    stop_words = STOP_WORDS
     tag_map = TAG_MAP
 
     @classmethod

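For reference, here is a minimal usage sketch of the tokenizer defined above. It assumes `mecab-python3` and a MeCab dictionary are installed (so that `try_mecab_import()` succeeds); the sample sentence is only an illustration.

```python
from spacy.lang.ja import Japanese

nlp = Japanese()
doc = nlp("これはテストです")  # "This is a test."

for token in doc:
    # tag_ is the MeCab POS string refined by resolve_pos(); lemma_ is the
    # dictionary base form; the raw MeCab tag is stored on the custom
    # attribute token._.mecab_tag, as set in JapaneseTokenizer.__call__ above.
    print(token.text, token.tag_, token.lemma_, token._.mecab_tag)
```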
View File

@@ -208,21 +208,24 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
 ### Environment variables for hyperparameters {#train-hyperparams new="2"}
 
-spaCy lets you set hyperparameters for training via environment variables. This
-is useful, because it keeps the command simple and allows you to
-[create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537)
-for your custom `train` command while still being able to easily tweak the
-hyperparameters. For example:
+spaCy lets you set hyperparameters for training via environment variables. For
+example:
 
 ```bash
-$ parser_hidden_depth=2 parser_maxout_pieces=1 spacy train [...]
+$ token_vector_width=256 learn_rate=0.0001 spacy train [...]
 ```
 
-```bash
-### Usage with alias
-alias train-parser="spacy train en /output /data /train /dev -n 1000"
-parser_maxout_pieces=1 train-parser
-```
+> #### Usage with alias
+>
+> Environment variables keep the command simple and allow you to
+> [create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537)
+> for your custom `train` command while still being able to easily tweak the
+> hyperparameters.
+>
+> ```bash
+> alias train-parser="python -m spacy train en /output /data /train /dev -n 1000"
+> token_vector_width=256 train-parser
+> ```
 
 | Name                 | Description                                           | Default |
 | -------------------- | --------------------------------------------------- | ------- |
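The general mechanism is simple: each hyperparameter falls back to a built-in default unless an environment variable with the same name is set when the command runs. The sketch below illustrates that pattern only; the helper name and the default values are hypothetical, not spaCy's actual implementation, and the variable names are taken from the example above.

```python
import os


def env_opt(name, default):
    """Hypothetical helper: read an env var and cast it to the default's type."""
    value = os.environ.get(name)
    return type(default)(value) if value is not None else default


# Placeholder defaults for illustration only; spaCy's real defaults are listed
# in the hyperparameter table.
token_vector_width = env_opt("token_vector_width", 128)
learn_rate = env_opt("learn_rate", 0.001)
print(token_vector_width, learn_rate)
```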