* Add initial stuff for Chinese parsing

This commit is contained in:
Matthew Honnibal 2016-04-24 18:44:24 +02:00
parent 67ce96c9c9
commit 8569dbc2d0
10 changed files with 302 additions and 16 deletions

View File

@ -36,11 +36,8 @@ from spacy.strings import hash_string
from preshed.counter import PreshCounter from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ from spacy.parts_of_speech import NOUN, VERB, ADJ
from spacy.util import get_lang_class
import spacy.en
import spacy.de
import spacy.fi
import spacy.it
try: try:
unicode unicode
@ -197,13 +194,6 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
def main(lang_id, lang_data_dir, corpora_dir, model_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir):
languages = {
'en': spacy.en.English.default_lex_attrs(),
'de': spacy.de.German.default_lex_attrs(),
'fi': spacy.fi.Finnish.default_lex_attrs(),
'it': spacy.it.Italian.default_lex_attrs(),
}
model_dir = Path(model_dir) model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir) / lang_id lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id corpora_dir = Path(corpora_dir) / lang_id
@ -216,7 +206,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
tag_map = json.load((lang_data_dir / 'tag_map.json').open()) tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab') setup_vocab(get_lang_class(lang_id).default_lex_attrs(), tag_map, corpora_dir,
model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists(): if (lang_data_dir / 'gazetteer.json').exists():
copyfile(str(lang_data_dir / 'gazetteer.json'), copyfile(str(lang_data_dir / 'gazetteer.json'),

View File

@ -13,8 +13,6 @@ import plac
import re import re
import spacy.util import spacy.util
from spacy.en import English
from spacy.de import German
from spacy.syntax.util import Config from spacy.syntax.util import Config
from spacy.gold import read_json_file from spacy.gold import read_json_file
@ -207,7 +205,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
@plac.annotations( @plac.annotations(
language=("The language to train", "positional", None, str, ['en','de']), language=("The language to train", "positional", None, str, ['en','de', 'zh']),
train_loc=("Location of training file or directory"), train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"), dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",), model_dir=("Location of output model directory",),
@ -223,7 +221,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
) )
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
lang = {'en':English, 'de':German}.get(language) lang = spacy.util.get_lang_class(language)
if not eval_only: if not eval_only:
gold_train = list(read_json_file(train_loc)) gold_train = list(read_json_file(train_loc))

194
lang_data/zh/gazetteer.json Normal file
View File

@ -0,0 +1,194 @@
{
"Reddit": [
"PRODUCT",
{},
[
[{"lower": "reddit"}]
]
],
"SeptemberElevenAttacks": [
"EVENT",
{},
[
[
{"orth": "9/11"}
],
[
{"lower": "september"},
{"orth": "11"}
]
]
],
"Linux": [
"PRODUCT",
{},
[
[{"lower": "linux"}]
]
],
"Haskell": [
"PRODUCT",
{},
[
[{"lower": "haskell"}]
]
],
"HaskellCurry": [
"PERSON",
{},
[
[
{"lower": "haskell"},
{"lower": "curry"}
]
]
],
"Javascript": [
"PRODUCT",
{},
[
[{"lower": "javascript"}]
]
],
"CSS": [
"PRODUCT",
{},
[
[{"lower": "css"}],
[{"lower": "css3"}]
]
],
"displaCy": [
"PRODUCT",
{},
[
[{"lower": "displacy"}]
]
],
"spaCy": [
"PRODUCT",
{},
[
[{"orth": "spaCy"}]
]
],
"HTML": [
"PRODUCT",
{},
[
[{"lower": "html"}],
[{"lower": "html5"}]
]
],
"Python": [
"PRODUCT",
{},
[
[{"orth": "Python"}]
]
],
"Ruby": [
"PRODUCT",
{},
[
[{"orth": "Ruby"}]
]
],
"Digg": [
"PRODUCT",
{},
[
[{"lower": "digg"}]
]
],
"FoxNews": [
"ORG",
{},
[
[{"orth": "Fox"}],
[{"orth": "News"}]
]
],
"Google": [
"ORG",
{},
[
[{"lower": "google"}]
]
],
"Mac": [
"PRODUCT",
{},
[
[{"lower": "mac"}]
]
],
"Wikipedia": [
"PRODUCT",
{},
[
[{"lower": "wikipedia"}]
]
],
"Windows": [
"PRODUCT",
{},
[
[{"orth": "Windows"}]
]
],
"Dell": [
"ORG",
{},
[
[{"lower": "dell"}]
]
],
"Facebook": [
"ORG",
{},
[
[{"lower": "facebook"}]
]
],
"Blizzard": [
"ORG",
{},
[
[{"orth": "Blizzard"}]
]
],
"Ubuntu": [
"ORG",
{},
[
[{"orth": "Ubuntu"}]
]
],
"Youtube": [
"PRODUCT",
{},
[
[{"lower": "youtube"}]
]
],
"false_positives": [
null,
{},
[
[{"orth": "Shit"}],
[{"orth": "Weed"}],
[{"orth": "Cool"}],
[{"orth": "Btw"}],
[{"orth": "Bah"}],
[{"orth": "Bullshit"}],
[{"orth": "Lol"}],
[{"orth": "Yo"}, {"lower": "dawg"}],
[{"orth": "Yay"}],
[{"orth": "Ahh"}],
[{"orth": "Yea"}],
[{"orth": "Bah"}]
]
]
}

6
lang_data/zh/infix.txt Normal file
View File

@ -0,0 +1,6 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-Z])
(?<=[a-zA-Z])--(?=[a-zA-Z])
(?<=[0-9])-(?=[0-9])
(?<=[A-Za-z]),(?=[A-Za-z])

1
lang_data/zh/morphs.json Normal file
View File

@ -0,0 +1 @@
{}

21
lang_data/zh/prefix.txt Normal file
View File

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

View File

@ -0,0 +1 @@
{}

26
lang_data/zh/suffix.txt Normal file
View File

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

43
lang_data/zh/tag_map.json Normal file
View File

@ -0,0 +1,43 @@
{
"NR": {"pos": "PROPN"},
"AD": {"pos": "ADV"},
"NN": {"pos": "NOUN"},
"CD": {"pos": "NUM"},
"DEG": {"pos": "PART"},
"PN": {"pos": "PRON"},
"M": {"pos": "PART"},
"JJ": {"pos": "ADJ"},
"DEC": {"pos": "PART"},
"NT": {"pos": "NOUN"},
"DT": {"pos": "DET"},
"LC": {"pos": "PART"},
"CC": {"pos": "CONJ"},
    "AS": {"pos": "PART"},
"IJ": {"pos": "INTJ"},
"OD": {"pos": "NUM"},
"MSP": {"pos": "PART"},
"CS": {"pos": "SCONJ"},
"ETC": {"pos": "PART"},
"DEV": {"pos": "PART"},
"BA": {"pos": "AUX"},
"SB": {"pos": "AUX"},
"DER": {"pos": "PART"},
"LB": {"pos": "AUX"},
"P": {"pos": "ADP"},
"URL": {"pos": "SYM"},
"FRAG": {"pos": "X"},
"X": {"pos": "X"},
"ON": {"pos": "X"},
"FW": {"pos": "X"},
"VC": {"pos": "VERB"},
"VV": {"pos": "VERB"},
"VA": {"pos": "VERB"},
"VE": {"pos": "VERB"},
"PU": {"pos": "PUNCT"},
"SP": {"pos": "SPACE"},
"NP": {"pos": "X"},
"_": {"pos": "X"},
"VP": {"pos": "X"},
"CHAR": {"pos": "X"}
}

5
spacy/zh/__init__.py Normal file
View File

@ -0,0 +1,5 @@
from ..language import Language
class Chinese(Language):
    """Minimal Chinese language subclass.

    Registers the ISO 639-1 code "zh" so that spacy.util.get_lang_class
    can resolve this class; all behaviour is inherited from Language.
    """
    # ISO 639-1 language code used for model/data directory lookup.
    lang = u"zh"