From d9312bc9ea74e48074f0b7f6a079b959f689e3f5 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Wed, 9 Mar 2016 16:18:48 +0100 Subject: [PATCH 01/34] add new files npchunks.{pyx,pxd} to hold noun phrase chunk generators --- setup.py | 2 ++ spacy/tokens/doc.pyx | 28 ++++++++------------ spacy/tokens/npchunks.pxd | 0 spacy/tokens/npchunks.pyx | 54 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 18 deletions(-) create mode 100644 spacy/tokens/npchunks.pxd create mode 100644 spacy/tokens/npchunks.pyx diff --git a/setup.py b/setup.py index 176434151..de7d95d22 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ MOD_NAMES = [ 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', + 'spacy.tokens.npchunks', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', @@ -184,3 +185,4 @@ def setup_package(): if __name__ == '__main__': setup_package() + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 26088be0c..fa45c8b3e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -23,6 +23,7 @@ from .token cimport Token from ..serialize.bits cimport BitArray from ..util import normalize_slice +import npchunks DEF PADDING = 5 @@ -239,24 +240,15 @@ cdef class Doc: "requires data to be installed. If you haven't done so, run: " "\npython -m spacy.en.download all\n" "to install the data") - - cdef const TokenC* word - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', - 'attr', 'root'] - np_deps = [self.vocab.strings[label] for label in labels] - conj = self.vocab.strings['conj'] - np_label = self.vocab.strings['NP'] - for i in range(self.length): - word = &self.c[i] - if word.pos == NOUN and word.dep in np_deps: - yield Span(self, word.l_edge, i+1, label=np_label) - elif word.pos == NOUN and word.dep == conj: - head = word+word.head - while head.dep == conj and head.head < 0: - head += head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - yield Span(self, word.l_edge, i+1, label=np_label) + + chunk_rules = {'en':npchunks.english, 'de':npchunks.german} + + for sent in self.sents: + lang = 'en' # todo: make dependent on language of root token + for chunk in chunk_rules.get(lang)(sent): + yield chunk + + @property def sents(self): diff --git a/spacy/tokens/npchunks.pxd b/spacy/tokens/npchunks.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tokens/npchunks.pyx b/spacy/tokens/npchunks.pyx new file mode 100644 index 000000000..0c5ca32a5 --- /dev/null +++ b/spacy/tokens/npchunks.pyx @@ -0,0 +1,54 @@ + +from ..structs cimport TokenC +from .doc cimport Doc +from .span cimport Span + +from ..parts_of_speech cimport NOUN, PROPN, PRON + +def english(Span sent): + cdef const TokenC* word + strings = sent.doc.vocab.strings + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] + np_deps = [strings[label] for label in labels] + conj = strings['conj'] + np_label = strings['NP'] + for i in range(sent.start, sent.end): + word = &sent.doc.c[i] + if word.pos == NOUN and word.dep in np_deps: + yield Span(sent.doc, word.l_edge, i+1, label=np_label) + elif word.pos == NOUN and word.dep == conj: + head = word+word.head + while head.dep == conj and head.head < 0: + head += head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + yield Span(sent.doc, word.l_edge, i+1, label=np_label) + + +def german(Span sent): + # this function extracts spans headed by NOUNs starting from the left-most + # syntactic 
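
The rule in `english()` above is easier to follow outside of Cython. The sketch below restates it in plain Python over a hand-built toy parse; the `Tok` tuple, the string-valued labels, and the example sentence are illustrative stand-ins for spaCy's `TokenC` structs and interned string ids, not part of the patch.

```python
from collections import namedtuple

# head is a relative offset and l_edge an absolute index, as in TokenC
Tok = namedtuple("Tok", "text pos dep head l_edge")

NP_DEPS = {"nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "attr", "root"}

def english_noun_chunks(tokens):
    # a NOUN with an NP-forming dependency label yields the span from its
    # left edge up to and including the noun itself
    for i, word in enumerate(tokens):
        if word.pos != "NOUN":
            continue
        if word.dep in NP_DEPS:
            yield (word.l_edge, i + 1)
        elif word.dep == "conj":
            # follow the conjunct chain leftwards via relative head offsets
            j = i + word.head
            while tokens[j].dep == "conj" and tokens[j].head < 0:
                j = j + tokens[j].head
            # if the head is an NP, and we're coordinated to it, we're an NP
            if tokens[j].dep in NP_DEPS:
                yield (word.l_edge, i + 1)

# toy parse of "dogs chase cats"
doc = [
    Tok("dogs", "NOUN", "nsubj", 1, 0),    # head +1 -> "chase"
    Tok("chase", "VERB", "root", 0, 0),
    Tok("cats", "NOUN", "dobj", -1, 2),    # head -1 -> "chase"
]
for start, end in english_noun_chunks(doc):
    print(" ".join(t.text for t in doc[start:end]))   # "dogs", then "cats"
```
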
dependent until the NOUN itself + # for close apposition and measurement construction, the span is sometimes + # extended to the right of the NOUN + # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not + # just "eine Tasse", same for "das Thema Familie" + cdef const TokenC* word + strings = sent.doc.vocab.strings + labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app'] + close_app = strings['nk'] + np_deps = [strings[label] for label in labels] + np_label = strings['NP'] + for i in range(sent.start, sent.end): + word = &sent.doc.c[i] + if word.pos == NOUN and word.dep in np_deps: + rbracket = i+1 + # try to extend the span to the right + # to capture close apposition/measurement constructions + for rdep in sent.doc[i].rights: + if rdep.pos == NOUN and rdep.dep == close_app: + rbracket = rdep.i+1 + yield Span(sent.doc, word.l_edge, rbracket, label=np_label) + + + + From bc9c62e2792a001bece151dd24414572425a7004 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Wed, 9 Mar 2016 18:07:37 +0100 Subject: [PATCH 02/34] replace Language functions with corresponding orth functions implement punctuation functions in orth --- spacy/language.py | 84 ++++++++--------------------------------------- spacy/orth.pyx | 22 +++++++------ 2 files changed, 26 insertions(+), 80 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 157f7d040..4df34d956 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -33,10 +33,6 @@ class Language(object): @staticmethod def norm(string): return string - - @staticmethod - def shape(string): - return orth.word_shape(string) @staticmethod def prefix(string): @@ -50,66 +46,14 @@ class Language(object): def cluster(string): return 0 - @staticmethod - def is_alpha(string): - return orth.is_alpha(string) - - @staticmethod - def is_ascii(string): - return orth.is_ascii(string) - @staticmethod def is_digit(string): return string.isdigit() - @staticmethod - def is_lower(string): - return orth.is_lower(string) - - @staticmethod - def is_punct(string): - return orth.is_punct(string) - @staticmethod def is_space(string): return string.isspace() - @staticmethod - def is_title(string): - return orth.is_title(string) - - @staticmethod - def is_bracket(string): - return orth.is_bracket(string) - - @staticmethod - def is_quote(string): - return orth.is_quote(string) - - @staticmethod - def is_left_punct(string): - return orth.is_left_punct(string) - - @staticmethod - def is_right_punct(string): - return orth.is_right_punct(string) - - @staticmethod - def is_upper(string): - return orth.is_upper(string) - - @staticmethod - def like_url(string): - return orth.like_url(string) - - @staticmethod - def like_num(string): - return orth.like_number(string) - - @staticmethod - def like_email(string): - return orth.like_email(string) - @staticmethod def is_stop(string): return 0 @@ -120,26 +64,26 @@ class Language(object): return { attrs.LOWER: cls.lower, attrs.NORM: cls.norm, - attrs.SHAPE: cls.shape, + attrs.SHAPE: orth.word_shape, attrs.PREFIX: cls.prefix, attrs.SUFFIX: cls.suffix, attrs.CLUSTER: cls.cluster, attrs.PROB: lambda string: oov_prob, - attrs.IS_ALPHA: cls.is_alpha, - attrs.IS_ASCII: cls.is_ascii, + attrs.IS_ALPHA: orth.is_alpha, + attrs.IS_ASCII: orth.is_ascii, attrs.IS_DIGIT: cls.is_digit, - attrs.IS_LOWER: cls.is_lower, - attrs.IS_PUNCT: cls.is_punct, + attrs.IS_LOWER: orth.is_lower, + attrs.IS_PUNCT: orth.is_punct, attrs.IS_SPACE: cls.is_space, - attrs.IS_TITLE: cls.is_title, - attrs.IS_UPPER: cls.is_upper, - 
attrs.FLAG14: cls.is_bracket, - attrs.FLAG15: cls.is_quote, - attrs.FLAG16: cls.is_left_punct, - attrs.FLAG17: cls.is_right_punct, - attrs.LIKE_URL: cls.like_url, - attrs.LIKE_NUM: cls.like_num, - attrs.LIKE_EMAIL: cls.like_email, + attrs.IS_TITLE: orth.is_title, + attrs.IS_UPPER: orth.is_upper, + attrs.FLAG14: orth.is_bracket, + attrs.FLAG15: orth.is_quote, + attrs.FLAG16: orth.is_left_punct, + attrs.FLAG17: orth.is_right_punct, + attrs.LIKE_URL: orth.like_url, + attrs.LIKE_NUM: orth.like_number, + attrs.LIKE_EMAIL: orth.like_email, attrs.IS_STOP: cls.is_stop, attrs.IS_OOV: lambda string: True } diff --git a/spacy/orth.pyx b/spacy/orth.pyx index cf3c3a9c9..418c3cfd4 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -5,9 +5,6 @@ import unicodedata import re -TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split() - - # Binary string features cpdef bint is_alpha(unicode string): return string.isalpha() @@ -36,20 +33,25 @@ cpdef bint is_ascii(unicode string): else: return True + cpdef bint is_bracket(unicode string): - return False + brackets = ('(',')','[',']','{','}','<','>') + return string in brackets + cpdef bint is_quote(unicode string): - if string in ('"', "'"): - return True - else: - return False + quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯') + return string in quotes + cpdef bint is_left_punct(unicode string): - return False + left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮') + return string in left_punct + cpdef bint is_right_punct(unicode string): - return False + right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯') + return string in right_punct cpdef bint is_title(unicode string): From 03fb498dbeb86e52b9b3e487ab8edfd836b53660 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Thu, 10 Mar 2016 13:01:34 +0100 Subject: [PATCH 03/34] introduce lang field for LexemeC to hold language id put noun_chunk logic into iterators.py for each language separately --- bin/init_model.py | 4 +-- setup.py | 5 +-- spacy/attrs.pxd | 20 +++++------ spacy/attrs.pyx | 9 ++--- .../{tokens/npchunks.pxd => de/iterators.pxd} | 0 .../{tokens/npchunks.pyx => de/iterators.pyx} | 34 +++---------------- spacy/en/iterators.pxd | 0 spacy/en/iterators.pyx | 24 +++++++++++++ spacy/language.py | 9 ++--- spacy/lexeme.pxd | 6 +++- spacy/lexeme.pyx | 16 ++++++--- spacy/orth.pyx | 6 ++-- spacy/structs.pxd | 2 ++ spacy/tokens/doc.pyx | 20 ++++++++--- spacy/tokens/token.pyx | 16 ++++++--- spacy/vocab.pyx | 2 ++ 16 files changed, 103 insertions(+), 70 deletions(-) rename spacy/{tokens/npchunks.pxd => de/iterators.pxd} (100%) rename spacy/{tokens/npchunks.pyx => de/iterators.pyx} (53%) create mode 100644 spacy/en/iterators.pxd create mode 100644 spacy/en/iterators.pyx diff --git a/bin/init_model.py b/bin/init_model.py index 19cfcdc25..5e62a7faf 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): else: file_ = loc.open() for i, line in enumerate(file_): - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) freq = int(freq) counts.inc(i+1, freq) total += freq @@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): file_ = loc.open() probs = {} for line in file_: - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < 
max_length: diff --git a/setup.py b/setup.py index de7d95d22..7449212b9 100644 --- a/setup.py +++ b/setup.py @@ -56,14 +56,15 @@ MOD_NAMES = [ 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', - 'spacy.tokens.npchunks', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner', - 'spacy.symbols'] + 'spacy.symbols', + 'spacy.en.iterators', + 'spacy.de.iterators'] # By subclassing build_extensions we have the actual compiler that will be used diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 61a00ba1b..a878a49d8 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -14,12 +14,12 @@ cpdef enum attr_id_t: LIKE_EMAIL IS_STOP IS_OOV - - FLAG14 = 14 - FLAG15 - FLAG16 - FLAG17 - FLAG18 + IS_BRACKET + IS_QUOTE + IS_LEFT_PUNCT + IS_RIGHT_PUNCT + + FLAG18 = 18 FLAG19 FLAG20 FLAG21 @@ -85,11 +85,7 @@ cpdef enum attr_id_t: HEAD SPACY PROB + + LANG -# Move these up to FLAG14--FLAG18 once we finish the functionality and -# are ready to regenerate the model -#IS_BRACKET -#IS_QUOTE -#IS_LEFT_PUNCT -#IS_RIGHT_PUNCT diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 146f3ab26..9a191beda 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -13,10 +13,10 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, - "FLAG15": FLAG15, - "FLAG16": FLAG16, - "FLAG17": FLAG17, + "IS_BRACKET": IS_BRACKET, + "IS_QUOTE": IS_QUOTE, + "IS_LEFT_PUNCT": IS_LEFT_PUNCT, + "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, "FLAG18": FLAG18, "FLAG19": FLAG19, "FLAG20": FLAG20, @@ -83,6 +83,7 @@ IDS = { "HEAD": HEAD, "SPACY": SPACY, "PROB": PROB, + "LANG": LANG, } # ATTR IDs, in order of the symbol diff --git a/spacy/tokens/npchunks.pxd b/spacy/de/iterators.pxd similarity index 100% rename from spacy/tokens/npchunks.pxd rename to spacy/de/iterators.pxd diff --git a/spacy/tokens/npchunks.pyx b/spacy/de/iterators.pyx similarity index 53% rename from spacy/tokens/npchunks.pyx rename to spacy/de/iterators.pyx index 0c5ca32a5..a6321bd57 100644 --- a/spacy/tokens/npchunks.pyx +++ b/spacy/de/iterators.pyx @@ -1,31 +1,9 @@ +from spacy.structs cimport TokenC +from spacy.tokens.span cimport Span -from ..structs cimport TokenC -from .doc cimport Doc -from .span cimport Span +from spacy.parts_of_speech cimport NOUN -from ..parts_of_speech cimport NOUN, PROPN, PRON - -def english(Span sent): - cdef const TokenC* word - strings = sent.doc.vocab.strings - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] - np_deps = [strings[label] for label in labels] - conj = strings['conj'] - np_label = strings['NP'] - for i in range(sent.start, sent.end): - word = &sent.doc.c[i] - if word.pos == NOUN and word.dep in np_deps: - yield Span(sent.doc, word.l_edge, i+1, label=np_label) - elif word.pos == NOUN and word.dep == conj: - head = word+word.head - while head.dep == conj and head.head < 0: - head += head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - yield Span(sent.doc, word.l_edge, i+1, label=np_label) - - -def german(Span sent): +def noun_chunks(Span sent): # this function extracts spans headed by NOUNs starting from the left-most # syntactic dependent until the NOUN itself # for close apposition and measurement construction, the span is sometimes @@ -48,7 +26,3 @@ def german(Span sent): if rdep.pos == NOUN and rdep.dep == close_app: rbracket = rdep.i+1 yield Span(sent.doc, word.l_edge, rbracket, label=np_label) - - - - diff --git a/spacy/en/iterators.pxd 
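
The German rule differs from the English one mainly in its right-extension step. Here is a plain-Python restatement, again over a hand-built parse; the `Tok` tuple and its `rights` index list are assumptions standing in for the `Token.rights` iterator used in the Cython code.

```python
from collections import namedtuple

Tok = namedtuple("Tok", "text pos dep l_edge rights")

NP_DEPS = {"sb", "oa", "da", "nk", "mo", "ag", "root", "cj", "pd", "og", "app"}

def german_noun_chunks(tokens):
    for i, word in enumerate(tokens):
        if word.pos == "NOUN" and word.dep in NP_DEPS:
            rbracket = i + 1
            # extend rightwards over close apposition/measure constructions
            for j in word.rights:
                if tokens[j].pos == "NOUN" and tokens[j].dep == "nk":
                    rbracket = j + 1
            yield (word.l_edge, rbracket)

# toy parse of "eine Tasse Tee": "Tasse" governs "Tee" via 'nk'
doc = [
    Tok("eine", "DET", "nk", 0, []),
    Tok("Tasse", "NOUN", "oa", 0, [2]),
    Tok("Tee", "NOUN", "nk", 2, []),
]
for start, end in german_noun_chunks(doc):
    print(" ".join(t.text for t in doc[start:end]))
# -> "eine Tasse Tee", then "Tee"
```

Note that the inner noun of a close apposition ("Tee") also carries an NP-forming label, so the rule yields it as a chunk of its own, exactly as the Cython version does.
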
b/spacy/en/iterators.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx new file mode 100644 index 000000000..e4f0fe2a4 --- /dev/null +++ b/spacy/en/iterators.pyx @@ -0,0 +1,24 @@ +from spacy.structs cimport TokenC +from spacy.tokens.span cimport Span + +from spacy.parts_of_speech cimport NOUN + +def noun_chunks(Span sent): + cdef const TokenC* word + strings = sent.doc.vocab.strings + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] + np_deps = [strings[label] for label in labels] + conj = strings['conj'] + np_label = strings['NP'] + for i in range(sent.start, sent.end): + word = &sent.doc.c[i] + if word.pos == NOUN and word.dep in np_deps: + yield Span(sent.doc, word.l_edge, i+1, label=np_label) + elif word.pos == NOUN and word.dep == conj: + head = word+word.head + while head.dep == conj and head.head < 0: + head += head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in np_deps: + yield Span(sent.doc, word.l_edge, i+1, label=np_label) + diff --git a/spacy/language.py b/spacy/language.py index 4df34d956..f186c2f2b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -69,6 +69,7 @@ class Language(object): attrs.SUFFIX: cls.suffix, attrs.CLUSTER: cls.cluster, attrs.PROB: lambda string: oov_prob, + attrs.LANG: lambda string: cls.lang, attrs.IS_ALPHA: orth.is_alpha, attrs.IS_ASCII: orth.is_ascii, attrs.IS_DIGIT: cls.is_digit, @@ -77,10 +78,10 @@ class Language(object): attrs.IS_SPACE: cls.is_space, attrs.IS_TITLE: orth.is_title, attrs.IS_UPPER: orth.is_upper, - attrs.FLAG14: orth.is_bracket, - attrs.FLAG15: orth.is_quote, - attrs.FLAG16: orth.is_left_punct, - attrs.FLAG17: orth.is_right_punct, + attrs.IS_BRACKET: orth.is_bracket, + attrs.IS_QUOTE: orth.is_quote, + attrs.IS_LEFT_PUNCT: orth.is_left_punct, + attrs.IS_RIGHT_PUNCT: orth.is_right_punct, attrs.LIKE_URL: orth.like_url, attrs.LIKE_NUM: orth.like_number, attrs.LIKE_EMAIL: orth.like_email, diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 6fc25efb6..12d4e3de3 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,6 +1,6 @@ from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .attrs cimport attr_id_t -from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER +from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG from .structs cimport LexemeC from .strings cimport StringStore @@ -41,6 +41,8 @@ cdef class Lexeme: lex.suffix = value elif name == CLUSTER: lex.cluster = value + elif name == LANG: + lex.lang = value @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -67,6 +69,8 @@ cdef class Lexeme: return lex.length elif feat_name == CLUSTER: return lex.cluster + elif feat_name == LANG: + return lex.lang else: return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1aec4a018..4e0f2cf2e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -18,10 +18,10 @@ import numpy from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP -from .attrs cimport FLAG14 as IS_BRACKET -from .attrs cimport FLAG15 as IS_QUOTE -from .attrs cimport FLAG16 as IS_LEFT_PUNCT -from .attrs cimport FLAG17 as IS_RIGHT_PUNCT +from .attrs cimport IS_BRACKET +from .attrs cimport IS_QUOTE +from .attrs cimport IS_LEFT_PUNCT +from .attrs cimport IS_RIGHT_PUNCT from .attrs cimport IS_OOV @@ -123,6 +123,10 @@ cdef 
class Lexeme: def __get__(self): return self.c.cluster def __set__(self, int x): self.c.cluster = x + property lang: + def __get__(self): return self.c.lang + def __set__(self, int x): self.c.lang = x + property prob: def __get__(self): return self.c.prob def __set__(self, float x): self.c.prob = x @@ -147,6 +151,10 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.suffix] def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] + property lang_: + def __get__(self): return self.vocab.strings[self.c.lang] + def __set__(self, unicode x): self.c.lang = self.vocab.strings[x] + property flags: def __get__(self): return self.c.flags def __set__(self, flags_t x): self.c.flags = x diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 418c3cfd4..0f30c1136 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string): cpdef bint is_quote(unicode string): - quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯') + quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``') return string in quotes cpdef bint is_left_punct(unicode string): - left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮') + left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``') return string in left_punct cpdef bint is_right_punct(unicode string): - right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯') + right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''") return string in right_punct diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 733ce3022..f7e6b1ec7 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -9,6 +9,8 @@ cdef struct LexemeC: flags_t flags + attr_t lang + attr_t id attr_t length diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index fa45c8b3e..887b1085f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -8,6 +8,7 @@ import struct cimport numpy as np import math import six +import warnings from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME @@ -23,7 +24,6 @@ from .token cimport Token from ..serialize.bits cimport BitArray from ..util import normalize_slice -import npchunks DEF PADDING = 5 @@ -241,11 +241,23 @@ cdef class Doc: "\npython -m spacy.en.download all\n" "to install the data") - chunk_rules = {'en':npchunks.english, 'de':npchunks.german} + from spacy.en.iterators import noun_chunks as en_noun_chunks + from spacy.de.iterators import noun_chunks as de_noun_chunks + + chunk_rules = {'en':en_noun_chunks, + 'de':de_noun_chunks, + } for sent in self.sents: - lang = 'en' # todo: make dependent on language of root token - for chunk in chunk_rules.get(lang)(sent): + print(sent) + lang = sent.root.lang_ + chunker = chunk_rules.get(lang,None) + if chunker == None: + warnings.warn("noun_chunks is not available for language %s." 
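
The extended punctuation predicates in `orth.pyx` are plain membership tests over character tuples. Restated in Python, including the PTB-style quote tokens (`` ` ` `` and `''`) this patch adds:

```python
# membership sets equivalent to the tuples in orth.pyx; only a subset of the
# quote characters is shown here for brevity
LEFT_PUNCT = set("([{<\"'") | {"«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``"}
RIGHT_PUNCT = set(")]}>\"'") | {"»", "’", "”", "›", "❯", "''"}

def is_left_punct(s): return s in LEFT_PUNCT
def is_right_punct(s): return s in RIGHT_PUNCT

assert is_left_punct("``") and is_right_punct("''")   # PTB quote tokens
assert is_left_punct("«") and is_right_punct("»")     # guillemets
assert is_left_punct('"') and is_right_punct('"')     # straight quote is both
```
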
% lang) + print(sent.root.orth_) + continue + + for chunk in chunker(sent): yield chunk diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 0ff574f1b..17d756b3e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP from ..parts_of_speech cimport CONJ, PUNCT from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE -from ..attrs cimport FLAG14 as IS_BRACKET -from ..attrs cimport FLAG15 as IS_QUOTE -from ..attrs cimport FLAG16 as IS_LEFT_PUNCT -from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT +from ..attrs cimport IS_BRACKET +from ..attrs cimport IS_QUOTE +from ..attrs cimport IS_LEFT_PUNCT +from ..attrs cimport IS_RIGHT_PUNCT from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV @@ -95,6 +95,10 @@ cdef class Token: def __get__(self): return self.c.lex.prob + property lang: + def __get__(self): + return self.c.lex.lang + property idx: def __get__(self): return self.c.idx @@ -310,6 +314,10 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.lex.suffix] + property lang_: + def __get__(self): + return self.vocab.strings[self.c.lex.lang] + property lemma_: def __get__(self): return self.vocab.strings[self.c.lemma] diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f876bfefb..df8a4bbd5 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -246,6 +246,7 @@ cdef class Vocab: fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1) fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1) fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1) + fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1) fp.close() def load_lexemes(self, loc): @@ -278,6 +279,7 @@ cdef class Vocab: fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) + fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang)) lexeme.vector = EMPTY_VEC py_str = self.strings[lexeme.orth] From 5e2e8e951a75348d069d68cade7972c6cff55ee9 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Wed, 16 Mar 2016 15:53:35 +0100 Subject: [PATCH 04/34] add baseclass DocIterator for iterators over documents add classes for English and German noun chunks the respective iterators are set for the document when created by the parser as they depend on the annotation scheme of the parsing model --- setup.py | 3 +- spacy/de/iterators.pxd | 0 spacy/de/iterators.pyx | 28 ------------- spacy/en/iterators.pxd | 0 spacy/en/iterators.pyx | 24 ----------- spacy/syntax/iterators.pxd | 16 ++++++++ spacy/syntax/iterators.pyx | 82 ++++++++++++++++++++++++++++++++++++++ spacy/syntax/parser.pyx | 24 +++++++---- spacy/tokens/doc.pxd | 4 ++ spacy/tokens/doc.pyx | 39 ++++++------------ spacy/vocab.pyx | 9 ++++- 11 files changed, 140 insertions(+), 89 deletions(-) delete mode 100644 spacy/de/iterators.pxd delete mode 100644 spacy/de/iterators.pyx delete mode 100644 spacy/en/iterators.pxd delete mode 100644 spacy/en/iterators.pyx create mode 100644 spacy/syntax/iterators.pxd create mode 100644 spacy/syntax/iterators.pyx diff --git a/setup.py b/setup.py index 7449212b9..91a118227 100644 --- a/setup.py +++ b/setup.py @@ -63,8 +63,7 @@ MOD_NAMES = [ 'spacy.matcher', 'spacy.syntax.ner', 'spacy.symbols', - 'spacy.en.iterators', - 'spacy.de.iterators'] + 'spacy.syntax.iterators'] # By subclassing build_extensions we have the actual compiler that will be used diff --git a/spacy/de/iterators.pxd 
b/spacy/de/iterators.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/de/iterators.pyx b/spacy/de/iterators.pyx deleted file mode 100644 index a6321bd57..000000000 --- a/spacy/de/iterators.pyx +++ /dev/null @@ -1,28 +0,0 @@ -from spacy.structs cimport TokenC -from spacy.tokens.span cimport Span - -from spacy.parts_of_speech cimport NOUN - -def noun_chunks(Span sent): - # this function extracts spans headed by NOUNs starting from the left-most - # syntactic dependent until the NOUN itself - # for close apposition and measurement construction, the span is sometimes - # extended to the right of the NOUN - # example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not - # just "eine Tasse", same for "das Thema Familie" - cdef const TokenC* word - strings = sent.doc.vocab.strings - labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app'] - close_app = strings['nk'] - np_deps = [strings[label] for label in labels] - np_label = strings['NP'] - for i in range(sent.start, sent.end): - word = &sent.doc.c[i] - if word.pos == NOUN and word.dep in np_deps: - rbracket = i+1 - # try to extend the span to the right - # to capture close apposition/measurement constructions - for rdep in sent.doc[i].rights: - if rdep.pos == NOUN and rdep.dep == close_app: - rbracket = rdep.i+1 - yield Span(sent.doc, word.l_edge, rbracket, label=np_label) diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx deleted file mode 100644 index e4f0fe2a4..000000000 --- a/spacy/en/iterators.pyx +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.structs cimport TokenC -from spacy.tokens.span cimport Span - -from spacy.parts_of_speech cimport NOUN - -def noun_chunks(Span sent): - cdef const TokenC* word - strings = sent.doc.vocab.strings - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] - np_deps = [strings[label] for label in labels] - conj = strings['conj'] - np_label = strings['NP'] - for i in range(sent.start, sent.end): - word = &sent.doc.c[i] - if word.pos == NOUN and word.dep in np_deps: - yield Span(sent.doc, word.l_edge, i+1, label=np_label) - elif word.pos == NOUN and word.dep == conj: - head = word+word.head - while head.dep == conj and head.head < 0: - head += head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - yield Span(sent.doc, word.l_edge, i+1, label=np_label) - diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd new file mode 100644 index 000000000..662f851c8 --- /dev/null +++ b/spacy/syntax/iterators.pxd @@ -0,0 +1,16 @@ + +from spacy.tokens.doc cimport Doc + +cdef class DocIterator: + cdef Doc _doc + +cdef class EnglishNounChunks(DocIterator): + cdef int i + cdef int _np_label + cdef set _np_deps + +cdef class GermanNounChunks(DocIterator): + cdef int i + cdef int _np_label + cdef set _np_deps + cdef int _close_app diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx new file mode 100644 index 000000000..78679b8ce --- /dev/null +++ b/spacy/syntax/iterators.pyx @@ -0,0 +1,82 @@ +from spacy.structs cimport TokenC +from spacy.tokens.span cimport Span +from spacy.tokens.doc cimport Doc +from spacy.tokens.token cimport Token + +from spacy.parts_of_speech cimport NOUN + +# base class for document iterators +cdef class DocIterator: + def __init__(self, Doc doc): + self._doc = doc + + def __iter__(self): + return self + + def __next__(self): 
+ raise NotImplementedError + + +cdef class EnglishNounChunks(DocIterator): + def __init__(self, Doc doc): + super(EnglishNounChunks,self).__init__(doc) + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root'] + self._np_label = self._doc.vocab.strings['NP'] + self._np_deps = set( self._doc.vocab.strings[label] for label in labels ) + self._conjunct = self._doc.vocab.strings['conj'] + self.i = 0 + + def __next__(self): + cdef const TokenC* word + cdef widx + while self.i < self._doc.length: + widx = self.i + self.i += 1 + word = &self._doc.c[widx] + if word.pos == NOUN: + if word.dep in self._np_deps: + return Span(self._doc, word.l_edge, widx+1, label=self._np_label) + elif word.dep == self._conjunct: + head = word+word.head + while head.dep == self._conjunct and head.head < 0: + head += head.head + # If the head is an NP, and we're coordinated to it, we're an NP + if head.dep in self._np_deps: + return Span(self._doc, word.l_edge, widx+1, label=self._np_label) + raise StopIteration + + +# this iterator extracts spans headed by NOUNs starting from the left-most +# syntactic dependent until the NOUN itself +# for close apposition and measurement construction, the span is sometimes +# extended to the right of the NOUN +# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not +# just "eine Tasse", same for "das Thema Familie" +cdef class GermanNounChunks(DocIterator): + def __init__(self, Doc doc): + super(GermanNounChunks,self).__init__(doc) + labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'root', 'cj', 'pd', 'og', 'app'] + self._np_label = self._doc.vocab.strings['NP'] + self._np_deps = set( self._doc.vocab.strings[label] for label in labels ) + self._close_app = self._doc.vocab.strings['nk'] + self.i = 0 + + def __next__(self): + cdef const TokenC* word + cdef int rbracket + cdef Token rdep + cdef widx + while self.i < self._doc.length: + widx = self.i + self.i += 1 + word = &self._doc.c[widx] + if word.pos == NOUN and word.dep in self._np_deps: + rbracket = widx+1 + # try to extend the span to the right + # to capture close apposition/measurement constructions + for rdep in self._doc[widx].rights: + if rdep.pos == NOUN and rdep.dep == self._close_app: + rbracket = rdep.i+1 + return Span(self._doc, word.l_edge, rbracket, label=self._np_label) + raise StopIteration + diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index a83c397dc..c7b88d5b8 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -47,6 +47,8 @@ from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC +from spacy.syntax.iterators cimport DocIterator, EnglishNounChunks, GermanNounChunks +CHUNKERS = {'en':EnglishNounChunks, 'de':GermanNounChunks} DEBUG = False @@ -113,12 +115,9 @@ cdef class Parser: cdef int nr_feat = self.model.nr_feat with nogil: self.parseC(tokens.c, tokens.length, nr_feat, nr_class) - tokens.is_parsed = True # Check for KeyboardInterrupt etc. 
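
The move from generator functions to iterator classes is the key design change in this patch: an iterator object can be stored on the `Doc` and configured by whichever parser annotated it, since the dependency labels depend on the parsing model. A minimal pure-Python skeleton of the pattern follows; the dict-based tokens and span tuples are demo assumptions, and the conjunct branch is omitted for brevity.

```python
class DocIterator(object):
    """Base class: holds the doc and implements the iterator protocol."""
    def __init__(self, doc):
        self._doc = doc
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError

class EnglishNounChunks(DocIterator):
    NP_DEPS = {"nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "attr", "root"}
    def __init__(self, doc):
        super(EnglishNounChunks, self).__init__(doc)
        self.i = 0   # scan position survives between __next__ calls
    def __next__(self):
        while self.i < len(self._doc):
            i, word = self.i, self._doc[self.i]
            self.i += 1
            if word["pos"] == "NOUN" and word["dep"] in self.NP_DEPS:
                return (word["l_edge"], i + 1)   # span bounds, end exclusive
        raise StopIteration

doc = [{"pos": "NOUN", "dep": "nsubj", "l_edge": 0},
       {"pos": "VERB", "dep": "root", "l_edge": 1}]
print(list(EnglishNounChunks(doc)))   # [(0, 1)]
```
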
Untested PyErr_CheckSignals() - # projectivize output - if self._projectivize: - PseudoProjectivity.deprojectivize(tokens) + self._finalize(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): cdef Pool mem = Pool() @@ -144,7 +143,7 @@ cdef class Parser: raise ValueError("Error parsing doc: %s" % sent_str) PyErr_CheckSignals() for doc in queue: - doc.is_parsed = True + self._finalize(doc) yield doc queue = [] batch_size = len(queue) @@ -155,10 +154,19 @@ cdef class Parser: with gil: sent_str = queue[i].text raise ValueError("Error parsing doc: %s" % sent_str) - for doc in queue: - doc.is_parsed = True - yield doc PyErr_CheckSignals() + for doc in queue: + self._finalize(doc) + yield doc + + def _finalize(self, Doc doc): + # deprojectivize output + if self._projectivize: + PseudoProjectivity.deprojectivize(doc) + # set annotation-specific iterators + doc.noun_chunks = CHUNKERS.get(doc.vocab.lang,DocIterator) + # mark doc as parsed + doc.is_parsed = True cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil: cdef ExampleC eg diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index aa2cf6b54..02b6f29a5 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -7,6 +7,8 @@ from ..structs cimport TokenC, LexemeC from ..typedefs cimport attr_t from ..attrs cimport attr_id_t +from spacy.syntax.iterators cimport DocIterator + cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil @@ -42,6 +44,8 @@ cdef class Doc: cdef int length cdef int max_length + cdef DocIterator noun_chunks_iterator + cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 887b1085f..faed51e23 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -81,6 +81,7 @@ cdef class Doc: self.is_parsed = False self._py_tokens = [] self._vector = None + self.noun_chunks_iterator = DocIterator(self) def __getitem__(self, object i): """Get a Token or a Span from the Doc. @@ -231,36 +232,22 @@ cdef class Doc: # Set start as B self.c[start].ent_iob = 3 - @property - def noun_chunks(self): - """Yield spans for base noun phrases.""" - if not self.is_parsed: - raise ValueError( - "noun_chunks requires the dependency parse, which " - "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.en.download all\n" - "to install the data") - from spacy.en.iterators import noun_chunks as en_noun_chunks - from spacy.de.iterators import noun_chunks as de_noun_chunks + property noun_chunks: + def __get__(self): + """Yield spans for base noun phrases.""" + if not self.is_parsed: + raise ValueError( + "noun_chunks requires the dependency parse, which " + "requires data to be installed. If you haven't done so, run: " + "\npython -m spacy.en.download all\n" + "to install the data") - chunk_rules = {'en':en_noun_chunks, - 'de':de_noun_chunks, - } + yield from self.noun_chunks_iterator - for sent in self.sents: - print(sent) - lang = sent.root.lang_ - chunker = chunk_rules.get(lang,None) - if chunker == None: - warnings.warn("noun_chunks is not available for language %s." 
% lang) - print(sent.root.orth_) - continue + def __set__(self, DocIterator): + self.noun_chunks_iterator = DocIterator(self) - for chunk in chunker(sent): - yield chunk - - @property def sents(self): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index df8a4bbd5..3494d2e40 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -27,7 +27,7 @@ from . import symbols from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer -from .attrs cimport PROB +from .attrs cimport PROB, LANG try: import copy_reg @@ -105,6 +105,13 @@ cdef class Vocab: self._serializer = Packer(self, self.serializer_freqs) return self._serializer + property lang: + def __get__(self): + langfunc = None + if self.get_lex_attr: + langfunc = self.get_lex_attr.get(LANG,None) + return langfunc('_') if langfunc else '' + def __len__(self): """The current number of lexemes stored.""" return self.length From 5080077097c7b88ffeb225ad9624595e7fe83694 Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Mon, 21 Mar 2016 16:10:25 +0100 Subject: [PATCH 05/34] revert init_model.py back to pre-german state (because it makes more sense) simplify token.n_rights and token.n_lefts --- bin/init_model.py | 7 +++---- spacy/tokens/token.pyx | 16 ++-------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 5e62a7faf..4d7611cce 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -125,8 +125,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: -# word = literal_eval(key) - word = key + word = literal_eval(key) smooth_count = counts.smoother(int(freq)) log_smooth_count = math.log(smooth_count) probs[word] = math.log(smooth_count) - log_total @@ -166,7 +165,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') if not probs: oov_prob = -20 else: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 17d756b3e..68ce2ffb5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -179,23 +179,11 @@ cdef class Token: property n_lefts: def __get__(self): - cdef int n = 0 - cdef const TokenC* ptr = self.c - self.i - while ptr != self.c: - if ptr + ptr.head == self.c: - n += 1 - ptr += 1 - return n + return self.c.l_kids property n_rights: def __get__(self): - cdef int n = 0 - cdef const TokenC* ptr = self.c + (self.array_len - self.i) - while ptr != self.c: - if ptr + ptr.head == self.c: - n += 1 - ptr -= 1 - return n + return self.c.r_kids property lefts: def __get__(self): From a7d7ea3afa776132d5f46f2f1b59a4deeda1748c Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 24 Mar 2016 11:19:43 +0100 Subject: [PATCH 06/34] first idea for supporting multiple langs in download script --- spacy/__init__.py | 9 ++++++-- spacy/about.py | 14 ++++++++++++- spacy/de/download.py | 13 ++++++++++++ spacy/download.py | 33 +++++++++++++++++++++++++++++ spacy/en/download.py | 49 
++------------------------------------------ spacy/util.py | 14 +++++++------ 6 files changed, 76 insertions(+), 56 deletions(-) create mode 100644 spacy/de/download.py create mode 100644 spacy/download.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 70e72b7a1..b09ee3491 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,8 +1,13 @@ from . import util -from .en import English +from .about import __models__ +import importlib def load(name, vectors=None, via=None): - return English( + if name not in __models__: + raise Exception('Model %s not found.' % name) + + mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') + return getattr(mod, __models__[name]['class'])( package=util.get_package_by_name(name, via=via), vectors_package=util.get_package_by_name(vectors, via=via)) diff --git a/spacy/about.py b/spacy/about.py index 3814b8d61..eed7c3f81 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -10,4 +10,16 @@ __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' -__default_model__ = 'en>=1.0.0,<1.1.0' +__models__ = { + 'en': { + 'module': 'en', + 'class': 'English', + 'package': 'en>=1.0.0,<1.1.0', + }, + 'de': { + 'module': 'de', + 'class': 'German', + 'package': 'de>=1.0.0,<1.1.0', + }, +} +__default_model__ = 'en' diff --git a/spacy/de/download.py b/spacy/de/download.py new file mode 100644 index 000000000..ba57c1d31 --- /dev/null +++ b/spacy/de/download.py @@ -0,0 +1,13 @@ +import plac +from ..download import download + + +@plac.annotations( + force=("Force overwrite", "flag", "f", bool), +) +def main(data_size='all', force=False): + download('de', force) + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/download.py b/spacy/download.py new file mode 100644 index 000000000..537c06872 --- /dev/null +++ b/spacy/download.py @@ -0,0 +1,33 @@ +from __future__ import print_function + +import sys + +import sputnik +from sputnik.package_list import (PackageNotFoundException, + CompatiblePackageNotFoundException) + +from . import about + + +def download(lang, force=False): + if force: + sputnik.purge(about.__title__, about.__version__) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + print("Model already installed. Please run 'python -m " + "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) + sys.exit(1) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + pass + + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + print("Model failed to install. Please run 'python -m " + "spacy.%s.download --force'." % lang, file=sys.stderr) + sys.exit(1) + + print("Model successfully installed.", file=sys.stderr) diff --git a/spacy/en/download.py b/spacy/en/download.py index 993b8b16d..f0c23b088 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,57 +1,12 @@ -from __future__ import print_function - -import sys -import os -import shutil - import plac -import sputnik -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -from .. 
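
`load()` now resolves a model name through the `__models__` registry and imports the matching language module dynamically. The same pattern in isolation, with stand-in registry entries drawn from the standard library so the demo runs anywhere (the real registry maps language ids to spaCy modules and package specs):

```python
import importlib

MODELS = {
    "en": {"module": "json", "class": "JSONDecoder"},   # stand-in entries,
    "de": {"module": "io", "class": "StringIO"},        # illustrative only
}

def load(name):
    if name not in MODELS:
        raise Exception("Model %s not found." % name)
    # resolve the module lazily, then pull the named class out of it
    mod = importlib.import_module(MODELS[name]["module"])
    return getattr(mod, MODELS[name]["class"])

print(load("en"))   # <class 'json.decoder.JSONDecoder'>
```
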
import about - - -def migrate(path): - data_path = os.path.join(path, 'data') - if os.path.isdir(data_path): - if os.path.islink(data_path): - os.unlink(data_path) - else: - shutil.rmtree(data_path) - for filename in os.listdir(path): - if filename.endswith('.tgz'): - os.unlink(os.path.join(path, filename)) +from ..download import download @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - if force: - sputnik.purge(about.__title__, about.__version__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - print("Model already installed. Please run 'python -m " - "spacy.en.download --force' to reinstall.", file=sys.stderr) - sys.exit(1) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - pass - - package = sputnik.install(about.__title__, about.__version__, about.__default_model__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - print("Model failed to install. Please run 'python -m " - "spacy.en.download --force'.", file=sys.stderr) - sys.exit(1) - - # FIXME clean up old-style packages - migrate(os.path.dirname(os.path.abspath(__file__))) - - print("Model successfully installed.", file=sys.stderr) + download('en', force) if __name__ == '__main__': diff --git a/spacy/util.py b/spacy/util.py index bcc55c656..37d3b7bab 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -23,15 +23,17 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): try: return sputnik.package(about.__title__, about.__version__, - name or about.__default_model__, data_path=via) + name or about.__models__[about.__default_model__]['package'], + data_path=via) except PackageNotFoundException as e: raise RuntimeError("Model %s not installed. Please run 'python -m " - "spacy.en.download' to install latest compatible " - "model." % name) + "spacy.%s.download' to install latest compatible " + "model." % (name, about.__models__[name]['module'])) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m spacy.en.download " - "--force' to install latest compatible model.") + raise RuntimeError("Installed model %s is not compatible with spaCy " + "version. Please run 'python -m spacy.%s.download " + "--force' to install latest compatible model." % + (name, about.__models__[name]['module'])) def normalize_slice(length, start, stop, step=None): From d65ef41d08194bbdda3934a2570d6895645da8af Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Thu, 24 Mar 2016 11:47:09 +0100 Subject: [PATCH 07/34] make error messages language independent --- spacy/lexeme.pyx | 4 ++-- spacy/tokens/doc.pyx | 8 ++++---- spacy/tokens/token.pyx | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 4e0f2cf2e..bae1eab39 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -74,8 +74,8 @@ cdef class Lexeme: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.en.download all\n" - "to install the data." + "\npython -m spacy.%s.download all\n" + "to install the data." 
% self.vocab.lang ) vector_view = self.c.vector diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index faed51e23..f46c1fe5e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -240,8 +240,8 @@ cdef class Doc: raise ValueError( "noun_chunks requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.en.download all\n" - "to install the data") + "\npython -m spacy.%s.download all\n" + "to install the data" % self.vocab.lang) yield from self.noun_chunks_iterator @@ -258,8 +258,8 @@ cdef class Doc: raise ValueError( "sentence boundary detection requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.en.download all\n" - "to install the data") + "\npython -m spacy.%s.download all\n" + "to install the data" % self.vocab.lang) cdef int i start = 0 for i in range(1, self.length): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 68ce2ffb5..52d6f05e7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -163,8 +163,8 @@ cdef class Token: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.en.download all\n" - "to install the data." + "\npython -m spacy.%s.download all\n" + "to install the data." % self.vocab.lang ) vector_view = self.c.lex.vector return numpy.asarray(vector_view) From f2cfbfc412f6c7e82afe37083c8a6d181779139f Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 24 Mar 2016 15:09:55 +0100 Subject: [PATCH 08/34] remove internal redundancy and overhead from StringStore --- spacy/strings.pxd | 1 + spacy/strings.pyx | 84 +++++++++++++++++++++++------------------------ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 6ba86d2ce..e2cd579c0 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -24,3 +24,4 @@ cdef class StringStore: cdef int64_t _resize_at cdef const Utf8Str* intern(self, unicode py_string) except NULL + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL diff --git a/spacy/strings.pyx b/spacy/strings.pyx index d11936d12..aa1f5c92d 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,30 +1,25 @@ -from __future__ import unicode_literals -import codecs +from __future__ import unicode_literals, absolute_import +cimport cython from libc.string cimport memcpy +from libc.stdint cimport uint64_t + from murmurhash.mrmr cimport hash64 from preshed.maps cimport map_iter, key_t -from cpython cimport PyUnicode_AS_DATA -from cpython cimport PyUnicode_GET_DATA_SIZE - -from libc.stdint cimport int64_t - - -from .typedefs cimport hash_t, attr_t - -try: - import codecs as io -except ImportError: - import io +from .typedefs cimport hash_t import ujson as json cpdef hash_t hash_string(unicode string) except 0: chars = string.encode('utf8') - return hash64(chars, len(chars), 1) + return _hash_utf8(chars, len(chars)) + + +cdef hash_t _hash_utf8(char* utf8_string, int length): + return hash64(utf8_string, length, 1) cdef unicode _decode(const Utf8Str* string): @@ -92,45 +87,43 @@ cdef class StringStore: def __getitem__(self, object string_or_id): cdef bytes byte_string - cdef unicode py_string cdef const Utf8Str* utf8str + cdef unsigned int int_id - cdef int id_ - if isinstance(string_or_id, int) or isinstance(string_or_id, long): - if string_or_id == 0: - return u'' - elif string_or_id < 1 or string_or_id >= self.size: + 
if isinstance(string_or_id, (int, long)): + try: + int_id = string_or_id + except OverflowError: raise IndexError(string_or_id) - utf8str = &self.c[string_or_id] + if int_id == 0: + return u'' + elif int_id >= self.size: + raise IndexError(string_or_id) + utf8str = &self.c[int_id] return _decode(utf8str) elif isinstance(string_or_id, bytes): - if len(string_or_id) == 0: + byte_string = string_or_id + if len(byte_string) == 0: return 0 - py_string = string_or_id.decode('utf8') - utf8str = self.intern(py_string) + utf8str = self._intern_utf8(byte_string, len(byte_string)) return utf8str - self.c elif isinstance(string_or_id, unicode): - if len(string_or_id) == 0: + if len(string_or_id) == 0: return 0 - py_string = string_or_id - utf8str = self.intern(py_string) + byte_string = (string_or_id).encode('utf8') + utf8str = self._intern_utf8(byte_string, len(byte_string)) return utf8str - self.c else: raise TypeError(type(string_or_id)) def __contains__(self, unicode string): cdef hash_t key = hash_string(string) - value = self._map.get(key) - return True if value is not NULL else False + return self._map.get(key) is not NULL def __iter__(self): cdef int i for i in range(self.size): - if i == 0: - yield u'' - else: - utf8str = &self.c[i] - yield _decode(utf8str) + yield _decode(&self.c[i]) if i > 0 else u'' def __reduce__(self): strings = [""] @@ -142,21 +135,26 @@ cdef class StringStore: cdef const Utf8Str* intern(self, unicode py_string) except NULL: # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = hash_string(py_string) + cdef bytes byte_string = py_string.encode('utf8') + return self._intern_utf8(byte_string, len(byte_string)) + + @cython.final + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL: + # 0 means missing, but we don't bother offsetting the index. 
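
The store's contract is: hash the UTF-8 bytes, intern unseen strings, and hand out stable integer ids, with id 0 reserved for the empty string. A dictionary-based sketch of that contract (Python's built-in `hash` substitutes for murmurhash64 here, so the ids are not comparable with the real store's):

```python
class MiniStringStore(object):
    def __init__(self):
        self._strings = [""]   # id 0 is reserved for the empty string
        self._map = {}         # utf-8 hash -> id

    def __getitem__(self, string_or_id):
        if isinstance(string_or_id, int):
            return self._strings[string_or_id]
        if len(string_or_id) == 0:
            return 0
        key = hash(string_or_id.encode("utf8"))   # murmurhash in the real store
        if key not in self._map:                  # intern on first sight
            self._map[key] = len(self._strings)
            self._strings.append(string_or_id)
        return self._map[key]

store = MiniStringStore()
assert store["hello"] == store["hello"] == 1   # interning gives a stable id
assert store[1] == "hello"                     # ids map back to strings
assert store[""] == 0 and store[0] == ""       # empty-string special case
```
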
+ cdef hash_t key = _hash_utf8(utf8_string, length) value = self._map.get(key) - if value != NULL: + if value is not NULL: return value if self.size == self._resize_at: self._realloc() - cdef bytes byte_string = py_string.encode('utf8') - self.c[self.size] = _allocate(self.mem, byte_string, len(byte_string)) + self.c[self.size] = _allocate(self.mem, utf8_string, length) self._map.set(key, &self.c[self.size]) self.size += 1 return &self.c[self.size-1] def dump(self, file_): - string_data = json.dumps([s for s in self]) + string_data = json.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') file_.write(string_data) @@ -166,8 +164,10 @@ cdef class StringStore: if strings == ['']: return None cdef unicode string - for string in strings: - if string: + for string in strings: + # explicit None/len check instead of simple truth testing + # (bug in Cython <= 0.23.4) + if string is not None and len(string): self.intern(string) def _realloc(self): From f18805ee1c4c37632cc5cfbc369223cb1fdd4641 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 24 Mar 2016 15:40:12 +0100 Subject: [PATCH 09/34] make StringStore.__contains__() return True for the empty string (which is also contained in iteration) --- spacy/strings.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index aa1f5c92d..c890cdd22 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -116,7 +116,9 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) - def __contains__(self, unicode string): + def __contains__(self, unicode string not None): + if len(string) == 0: + return True cdef hash_t key = hash_string(string) return self._map.get(key) is not NULL From b8f63071eb1a8a1523ca91819485a350afd83c14 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:54:45 +0100 Subject: [PATCH 10/34] add lang registration facility --- spacy/__init__.py | 21 ++++++++++++--------- spacy/about.py | 14 +++----------- spacy/download.py | 6 +++--- spacy/tokenizer.pyx | 3 +-- spacy/util.py | 32 ++++++++++++++++++++++++-------- spacy/vocab.pyx | 1 - 6 files changed, 43 insertions(+), 34 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index b09ee3491..f47926a63 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,13 +1,16 @@ from . import util -from .about import __models__ -import importlib + +from .en import English +from .de import German +from . import util + + +util.register_lang(English.lang, English) +util.register_lang(German.lang, German) def load(name, vectors=None, via=None): - if name not in __models__: - raise Exception('Model %s not found.' 
% name) - - mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') - return getattr(mod, __models__[name]['class'])( - package=util.get_package_by_name(name, via=via), - vectors_package=util.get_package_by_name(vectors, via=via)) + package = util.get_package_by_name(name, via=via) + vectors_package = util.get_package_by_name(vectors, via=via) + cls = util.get_lang(name) + return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/about.py b/spacy/about.py index eed7c3f81..7f889cad8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -11,15 +11,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' __models__ = { - 'en': { - 'module': 'en', - 'class': 'English', - 'package': 'en>=1.0.0,<1.1.0', - }, - 'de': { - 'module': 'de', - 'class': 'German', - 'package': 'de>=1.0.0,<1.1.0', - }, + 'en': 'en>=1.0.0,<1.1.0', + 'de': 'de>=1.0.0,<1.1.0', } -__default_model__ = 'en' +__default_lang__ = 'en' diff --git a/spacy/download.py b/spacy/download.py index 537c06872..f7fc798ae 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -14,17 +14,17 @@ def download(lang, force=False): sputnik.purge(about.__title__, about.__version__) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) print("Model already installed. Please run 'python -m " "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) sys.exit(1) except (PackageNotFoundException, CompatiblePackageNotFoundException): pass - package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) except (PackageNotFoundException, CompatiblePackageNotFoundException): print("Model failed to install. Please run 'python -m " "spacy.%s.download --force'." % lang, file=sys.stderr) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8613fce8..44d627505 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -16,8 +16,7 @@ cimport cython from . import util from .tokens.doc cimport Doc -from .util import read_lang_data -from .util import get_package +from .util import read_lang_data, get_package cdef class Tokenizer: diff --git a/spacy/util.py b/spacy/util.py index 37d3b7bab..4eda2d0e4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,21 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +LANGUAGES = {} + + +def register_lang(name, cls): + global LANGUAGES + LANGUAGES[name] = cls + + +def get_lang(name): + lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] + if lang not in LANGUAGES: + raise RuntimeError('Language not supported: %s' % lang) + return LANGUAGES[lang] + + def get_package(data_dir): if not isinstance(data_dir, six.string_types): raise RuntimeError('data_dir must be a string') @@ -21,19 +36,20 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): + package_name = name or about.__models__[about.__default_lang__] + lang = get_lang(package_name) try: return sputnik.package(about.__title__, about.__version__, - name or about.__models__[about.__default_model__]['package'], - data_path=via) + package_name, data_path=via) except PackageNotFoundException as e: - raise RuntimeError("Model %s not installed. 
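
`get_lang()` above derives the language id from the leading alphanumeric run of a model name, so a package spec such as `en>=1.0.0,<1.1.0` resolves to the class registered for `en`. A standalone version of the registry, mirroring the `util.py` code in this patch:

```python
import re

LANGUAGES = {}

def register_lang(name, cls):
    LANGUAGES[name] = cls

def get_lang(name):
    # the language id is the leading [a-zA-Z0-9_] run of the model name
    lang = re.split(r"[^a-zA-Z0-9_]", name, maxsplit=1)[0]
    if lang not in LANGUAGES:
        raise RuntimeError("Language not supported: %s" % lang)
    return LANGUAGES[lang]

class English(object):
    lang = "en"

register_lang(English.lang, English)
assert get_lang("en") is English
assert get_lang("en>=1.0.0,<1.1.0") is English
```
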
Please run 'python -m " - "spacy.%s.download' to install latest compatible " - "model." % (name, about.__models__[name]['module'])) + raise RuntimeError("Model '%s' not installed. Please run 'python -m " + "%s.download' to install latest compatible " + "model." % (name, lang.__module__)) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model %s is not compatible with spaCy " - "version. Please run 'python -m spacy.%s.download " + raise RuntimeError("Installed model is not compatible with spaCy " + "version. Please run 'python -m %s.download " "--force' to install latest compatible model." % - (name, about.__models__[name]['module'])) + (lang.__module__)) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f876bfefb..3712a7383 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -25,7 +25,6 @@ from . import attrs from . import symbols from cymem.cymem cimport Address -from . import util from .serialize.packer cimport Packer from .attrs cimport PROB From db095a162c12d4e68b11543e16ba5a9c47881d23 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:59:47 +0100 Subject: [PATCH 11/34] fix --- spacy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f47926a63..d01bb11f3 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -2,7 +2,6 @@ from . import util from .en import English from .de import German -from . import util util.register_lang(English.lang, English) From c90d4a6f17aa2940b744863c2491f23637fe0c24 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 26 Mar 2016 11:44:53 +0100 Subject: [PATCH 12/34] relative imports in __init__.py --- spacy/__init__.py | 12 ++++++------ spacy/util.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index d01bb11f3..676659fdd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,15 +1,15 @@ -from . 
import util +from .util import set_lang_class, get_lang_class, get_package, get_package_by_name from .en import English from .de import German -util.register_lang(English.lang, English) -util.register_lang(German.lang, German) +set_lang_class(English.lang, English) +set_lang_class(German.lang, German) def load(name, vectors=None, via=None): - package = util.get_package_by_name(name, via=via) - vectors_package = util.get_package_by_name(vectors, via=via) - cls = util.get_lang(name) + package = get_package_by_name(name, via=via) + vectors_package = get_package_by_name(vectors, via=via) + cls = get_lang_class(name) return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/util.py b/spacy/util.py index 4eda2d0e4..b1e93d08b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,12 +17,12 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE LANGUAGES = {} -def register_lang(name, cls): +def set_lang_class(name, cls): global LANGUAGES LANGUAGES[name] = cls -def get_lang(name): +def get_lang_class(name): lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] if lang not in LANGUAGES: raise RuntimeError('Language not supported: %s' % lang) @@ -37,7 +37,7 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): package_name = name or about.__models__[about.__default_lang__] - lang = get_lang(package_name) + lang = get_lang_class(package_name) try: return sputnik.package(about.__title__, about.__version__, package_name, data_path=via) From ad119c074fa6eb5c89f3eb4118fd630e5505b45c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Mar 2016 13:02:42 +1100 Subject: [PATCH 13/34] * Fix incorrect whitespacing in Doc.text. This change is potentially breaking, to anyone who was relying on the previous incorrect semantics. --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 26088be0c..c5111088f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -166,7 +166,7 @@ cdef class Doc: @property def text(self): - return u' '.join(t.text for t in self) + return u''.join(t.text for t in self) property ents: def __get__(self): From 910a6c805ffa892a05a24e7452c58b6d649afda4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Mar 2016 13:03:44 +1100 Subject: [PATCH 14/34] * Add infix rule for double hyphens, re Issue #302 --- lang_data/en/infix.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lang_data/en/infix.txt b/lang_data/en/infix.txt index cb9cc0a78..8c83b7d4c 100644 --- a/lang_data/en/infix.txt +++ b/lang_data/en/infix.txt @@ -1,4 +1,5 @@ \.\.\. 
(?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) +(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) From d249e2f7f30af76470ae4a02fd2203ed641f6f91 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Mar 2016 13:04:33 +1100 Subject: [PATCH 15/34] * Improve error message in bin/parser/train.py --- bin/parser/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 642ed53e7..2b9e24051 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -141,7 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', nlp.tagger(tokens) gold = GoldParse(tokens, annot_tuples) if not gold.is_projective: - raise Exception("Non-projective sentence in training: %s" % annot_tuples) + raise Exception("Non-projective sentence in training: %s" % annot_tuples[1]) loss += nlp.parser.train(tokens, gold) nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) From 9c73983bdd8fcf3df15a00ef5066edd105ef2fd7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Mar 2016 14:27:13 +1100 Subject: [PATCH 16/34] * Add test for hyphenation problem in Issue #302 --- spacy/tests/tokenizer/test_infix.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py index d703682cf..eda4643a6 100644 --- a/spacy/tests/tokenizer/test_infix.py +++ b/spacy/tests/tokenizer/test_infix.py @@ -32,3 +32,9 @@ def test_email(en_tokenizer): assert len(tokens) == 1 +def test_double_hyphen(en_tokenizer): + tokens = en_tokenizer(u'No decent--let alone well-bred--people.') + assert tokens[0].text == u'No' + assert tokens[1].text == u'decent' + assert tokens[2].text == u'--' + assert tokens[3].text == u'let' From b1fe41b45d6a36cbbabb203ddf0357fca1689265 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Mar 2016 14:31:05 +1100 Subject: [PATCH 17/34] * Extend infix test, commenting on limitation of tokenizer w.r.t. infixes at the moment. --- spacy/tests/tokenizer/test_infix.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py index eda4643a6..7a107733b 100644 --- a/spacy/tests/tokenizer/test_infix.py +++ b/spacy/tests/tokenizer/test_infix.py @@ -38,3 +38,12 @@ def test_double_hyphen(en_tokenizer): assert tokens[1].text == u'decent' assert tokens[2].text == u'--' assert tokens[3].text == u'let' + assert tokens[4].text == u'alone' + assert tokens[5].text == u'well' + assert tokens[6].text == u'-' + # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter + # on infixes. 
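A minimal sketch of the limitation described in the TODO above (illustrative Python only, not the tokenizer's actual control flow): the infix patterns are applied to a token in a single pass, and the resulting pieces are not fed back through the rules, so a second infix later in the same token is missed. That is why the remaining asserts below stay commented out:

import re

# Combined infix rules, copied from lang_data/en/infix.txt as extended in
# this patch (the trailing [a-zA-z] range appears like that in the source).
infix_re = re.compile(r'(?<=[a-zA-Z])--(?=[a-zA-z])|(?<=[a-zA-Z])-(?=[a-zA-z])')

def split_once(token):
    # One pass: split on the leftmost infix match and never revisit the pieces.
    match = infix_re.search(token)
    if not match:
        return [token]
    return [token[:match.start()], match.group(), token[match.end():]]

def split_reentrant(token):
    # Re-entering on the remainder yields the tokens the test expects.
    match = infix_re.search(token)
    if not match:
        return [token]
    return [token[:match.start()], match.group()] + split_reentrant(token[match.end():])

print(split_once(u'well-bred--people.'))
# [u'well', u'-', u'bred--people.']   <- 'bred--people.' is never re-split
print(split_reentrant(u'well-bred--people.'))
# [u'well', u'-', u'bred', u'--', u'people.']   (suffix handling of '.' aside)
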
+ #assert tokens[7].text == u'bred' + #assert tokens[8].text == u'--' + #assert tokens[9].text == u'people' + From a8f4e4990096be4be6922761aae7c73c7f3ce80e Mon Sep 17 00:00:00 2001 From: Wolfgang Seeker Date: Tue, 29 Mar 2016 16:12:13 +0200 Subject: [PATCH 18/34] update init_model.py to previous (better) state --- bin/init_model.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index b14015b39..3bbd7c469 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): else: file_ = loc.open() for i, line in enumerate(file_): - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) freq = int(freq) counts.inc(i+1, freq) total += freq @@ -121,15 +121,13 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): file_ = loc.open() probs = {} for line in file_: - freq, doc_freq, key = line.split('\t', 2) + freq, doc_freq, key = line.rstrip().split('\t', 2) doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: -# word = literal_eval(key) - word = key + word = literal_eval(key) smooth_count = counts.smoother(int(freq)) - log_smooth_count = math.log(smooth_count) - probs[word] = log_smooth_count - log_total + probs[word] = math.log(smooth_count) - log_total oov_prob = math.log(counts.smoother(0)) - log_total return probs, oov_prob @@ -166,7 +164,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') if not probs: oov_prob = -20 else: From f321272bee5ade7836b2bcc2e6ed68fbb5903bba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Apr 2016 00:36:56 +1100 Subject: [PATCH 19/34] Update gitignore for website --- .gitignore | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 5c75b8b05..0311fe842 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,11 @@ htmlcov/ nosetests.xml coverage.xml +# Website +website/www/ +website/demos/displacy/ +website/demos/sense2vec/ + # Translations *.mo @@ -88,11 +93,6 @@ coverage.xml *.log *.pot -# Sphinx documentation -docs/_build/ -docs/_themes/ -setup.py - # Windows local helper files *.bat From 1f8309a862c0c7f004a7e5586b4eac1c222319d5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Apr 2016 01:24:48 +1100 Subject: [PATCH 20/34] Replace website with new version --- website/404.jade | 10 + website/README.md | 35 +- website/_data.json | 51 + website/_fabfile.py | 94 + website/_harp.json | 85 + website/_includes/_analytics.jade | 7 + website/_includes/_article.jade | 34 + website/_includes/_footer.jade | 14 + website/_includes/_functions.jade | 101 + website/_includes/_head.jade | 31 + website/_includes/_header.jade | 21 + website/_includes/_latest-posts.jade | 17 + website/_includes/_logo.jade | 5 + website/_includes/_mixins.jade | 381 +++ website/_includes/_nav.jade | 32 + website/_includes/_newsletter.jade | 21 + 
website/_includes/_profile.jade | 21 + website/_includes/_sidebar.jade | 12 + website/_includes/_teaser.jade | 22 + website/_layout.jade | 43 + website/assets/css/_base/_animations.sass | 23 + website/assets/css/_base/_fonts.sass | 112 + website/assets/css/_base/_grid.sass | 131 + website/assets/css/_base/_reset.sass | 75 + website/assets/css/_base/_typography.sass | 174 + website/assets/css/_components/_alerts.sass | 33 + website/assets/css/_components/_asides.sass | 69 + website/assets/css/_components/_boxes.sass | 39 + website/assets/css/_components/_buttons.sass | 75 + website/assets/css/_components/_cards.sass | 44 + website/assets/css/_components/_code.sass | 87 + website/assets/css/_components/_dividers.sass | 72 + website/assets/css/_components/_embeds.sass | 35 + website/assets/css/_components/_forms.sass | 28 + website/assets/css/_components/_icons.sass | 39 + website/assets/css/_components/_images.sass | 39 + website/assets/css/_components/_labels.sass | 46 + website/assets/css/_components/_links.sass | 60 + website/assets/css/_components/_lists.sass | 59 + website/assets/css/_components/_logo.sass | 36 + website/assets/css/_components/_misc.sass | 88 + website/assets/css/_components/_quotes.sass | 36 + website/assets/css/_components/_tables.sass | 79 + website/assets/css/_components/_tooltips.sass | 44 + website/assets/css/_layout/_article.sass | 25 + website/assets/css/_layout/_body.sass | 53 + website/assets/css/_layout/_footer.sass | 20 + website/assets/css/_layout/_header.sass | 50 + website/assets/css/_layout/_nav.sass | 108 + website/assets/css/_layout/_sidebar.sass | 65 + website/assets/css/_utils/_functions.sass | 23 + website/assets/css/_utils/_mixins.sass | 96 + website/assets/css/_variables.scss | 93 + website/assets/css/_vendors/_displacy.sass | 42 + website/assets/css/_vendors/_normalize.sass | 181 ++ website/assets/css/_vendors/_prism.sass | 77 + website/assets/css/style.sass | 60 + website/assets/css/style_blog.sass | 6 + website/assets/fonts/icomoon.eot | Bin 0 -> 5148 bytes website/assets/fonts/icomoon.svg | 27 + website/assets/fonts/icomoon.ttf | Bin 0 -> 4984 bytes website/assets/fonts/icomoon.woff | Bin 0 -> 5060 bytes website/assets/fonts/lato-bold.eot | Bin 0 -> 35215 bytes website/assets/fonts/lato-bold.svg | 2787 ++++++++++++++++ website/assets/fonts/lato-bold.ttf | Bin 0 -> 76172 bytes website/assets/fonts/lato-bold.woff | Bin 0 -> 38312 bytes website/assets/fonts/lato-bold.woff2 | Bin 0 -> 30576 bytes website/assets/fonts/lato-bolditalic.eot | Bin 0 -> 37995 bytes website/assets/fonts/lato-bolditalic.svg | 2800 ++++++++++++++++ website/assets/fonts/lato-bolditalic.ttf | Bin 0 -> 81572 bytes website/assets/fonts/lato-bolditalic.woff | Bin 0 -> 41100 bytes website/assets/fonts/lato-bolditalic.woff2 | Bin 0 -> 32900 bytes website/assets/fonts/lato-italic.eot | Bin 0 -> 37885 bytes website/assets/fonts/lato-italic.svg | 2805 +++++++++++++++++ website/assets/fonts/lato-italic.ttf | Bin 0 -> 82492 bytes website/assets/fonts/lato-italic.woff | Bin 0 -> 41084 bytes website/assets/fonts/lato-italic.woff2 | Bin 0 -> 32980 bytes website/assets/fonts/lato-regular.eot | Bin 0 -> 35026 bytes website/assets/fonts/lato-regular.svg | 2788 ++++++++++++++++ website/assets/fonts/lato-regular.ttf | Bin 0 -> 76396 bytes website/assets/fonts/lato-regular.woff | Bin 0 -> 38240 bytes website/assets/fonts/lato-regular.woff2 | Bin 0 -> 30348 bytes .../assets/fonts/sourcecodepro-semibold.eot | Bin 0 -> 24976 bytes .../assets/fonts/sourcecodepro-semibold.svg | 244 ++ 
.../assets/fonts/sourcecodepro-semibold.ttf | Bin 0 -> 55544 bytes .../assets/fonts/sourcecodepro-semibold.woff | Bin 0 -> 27952 bytes website/assets/fonts/worksans-bold.eot | Bin 0 -> 28428 bytes website/assets/fonts/worksans-bold.svg | 1909 +++++++++++ website/assets/fonts/worksans-bold.ttf | Bin 0 -> 66460 bytes website/assets/fonts/worksans-bold.woff | Bin 0 -> 31812 bytes website/assets/fonts/worksans-bold.woff2 | Bin 0 -> 25116 bytes website/assets/fonts/worksans-regular.eot | Bin 0 -> 26774 bytes website/assets/fonts/worksans-regular.svg | 1586 ++++++++++ website/assets/fonts/worksans-regular.ttf | Bin 0 -> 62940 bytes website/assets/fonts/worksans-regular.woff | Bin 0 -> 30088 bytes website/assets/fonts/worksans-regular.woff2 | Bin 0 -> 23644 bytes website/assets/fonts/worksans-semibold.eot | Bin 0 -> 29022 bytes website/assets/fonts/worksans-semibold.svg | 1909 +++++++++++ website/assets/fonts/worksans-semibold.ttf | Bin 0 -> 67040 bytes website/assets/fonts/worksans-semibold.woff | Bin 0 -> 32208 bytes website/assets/fonts/worksans-semibold.woff2 | Bin 0 -> 25464 bytes website/assets/img/favicon.ico | Bin 0 -> 1150 bytes website/assets/img/logo.png | Bin 0 -> 4246 bytes website/assets/img/logo.svg | 1 + website/assets/img/logos/chartbeat.png | Bin 0 -> 1764 bytes website/assets/img/logos/cytora.png | Bin 0 -> 2079 bytes website/assets/img/logos/keyreply.png | Bin 0 -> 5901 bytes website/assets/img/logos/kip.png | Bin 0 -> 3268 bytes website/assets/img/logos/signaln.png | Bin 0 -> 1291 bytes website/assets/img/logos/socrata.png | Bin 0 -> 2693 bytes website/assets/img/pattern_blue.jpg | Bin 0 -> 222353 bytes website/assets/img/pattern_red.jpg | Bin 0 -> 143953 bytes website/assets/img/profile_elmar.png | Bin 0 -> 1313 bytes website/assets/img/profile_henning.png | Bin 0 -> 91071 bytes website/assets/img/profile_ines.png | Bin 0 -> 91860 bytes website/assets/img/profile_ines_alt.png | Bin 0 -> 118606 bytes website/assets/img/profile_matt.png | Bin 0 -> 102151 bytes website/assets/img/profile_matt_alt.png | Bin 0 -> 99721 bytes website/assets/img/profile_placeholder.png | Bin 0 -> 1313 bytes website/assets/img/profile_wolfgang.png | Bin 0 -> 98714 bytes website/assets/img/social.png | Bin 0 -> 252761 bytes website/assets/img/spacy_screen.png | Bin 0 -> 515998 bytes website/assets/js/main.js | 92 + website/assets/js/prism.js | 25 + website/blog/_data.json | 187 ++ website/blog/dead-code-should-be-buried.jade | 30 + website/blog/displacy.jade | 28 + .../displacy/pizza-with-anchovies-bad.html | 14 + .../displacy/pizza-with-anchovies-good.html | 14 + .../displacy/robots-in-popular-culture.html | 14 + .../blog/eli5-computers-learn-reading.jade | 31 + website/blog/how-spacy-works.jade | 143 + website/blog/img/agpl-not-free.jpg | Bin 0 -> 345274 bytes website/blog/img/agpl-not-free_large.jpg | Bin 0 -> 600488 bytes website/blog/img/agpl-not-free_small.jpg | Bin 0 -> 100526 bytes website/blog/img/anchovies.png | Bin 0 -> 151225 bytes website/blog/img/basic-english.jpg | Bin 0 -> 323077 bytes website/blog/img/basic-english_large.jpg | Bin 0 -> 583016 bytes website/blog/img/basic-english_small.jpg | Bin 0 -> 88650 bytes website/blog/img/cython.jpg | Bin 0 -> 328880 bytes website/blog/img/cython_large.jpg | Bin 0 -> 639656 bytes website/blog/img/cython_small.jpg | Bin 0 -> 84288 bytes website/blog/img/deadcode.jpg | Bin 0 -> 333040 bytes website/blog/img/deadcode_large.jpg | Bin 0 -> 645418 bytes website/blog/img/deadcode_small.jpg | Bin 0 -> 83174 bytes website/blog/img/displacy.jpg | Bin 0 
-> 38432 bytes website/blog/img/displacy_large.jpg | Bin 0 -> 42721 bytes website/blog/img/displacy_small.jpg | Bin 0 -> 21451 bytes website/blog/img/how-spacy-works.jpg | Bin 0 -> 364963 bytes website/blog/img/how-spacy-works_large.jpg | Bin 0 -> 741151 bytes website/blog/img/how-spacy-works_small.jpg | Bin 0 -> 77669 bytes website/blog/img/introducing-spacy.jpg | Bin 0 -> 248667 bytes website/blog/img/introducing-spacy_large.jpg | Bin 0 -> 373223 bytes website/blog/img/introducing-spacy_small.jpg | Bin 0 -> 84481 bytes website/blog/img/linguistic-structure.jpg | Bin 0 -> 87627 bytes website/blog/img/markup.jpg | Bin 0 -> 488608 bytes website/blog/img/markup_basscss.jpg | Bin 0 -> 21348 bytes website/blog/img/markup_bootstrap.jpg | Bin 0 -> 33933 bytes website/blog/img/markup_docs.jpg | Bin 0 -> 73732 bytes website/blog/img/markup_large.jpg | Bin 0 -> 812645 bytes website/blog/img/markup_mixins.jpg | Bin 0 -> 101381 bytes website/blog/img/markup_sections.jpg | Bin 0 -> 40888 bytes website/blog/img/markup_small.jpg | Bin 0 -> 147823 bytes website/blog/img/markup_workflow.jpg | Bin 0 -> 90331 bytes website/blog/img/pizza.jpg | Bin 0 -> 354385 bytes website/blog/img/pizza_large.jpg | Bin 0 -> 660178 bytes website/blog/img/pizza_small.jpg | Bin 0 -> 98574 bytes website/blog/img/pos-tagger.jpg | Bin 0 -> 344945 bytes website/blog/img/pos-tagger_large.jpg | Bin 0 -> 613014 bytes website/blog/img/pos-tagger_small.jpg | Bin 0 -> 91301 bytes website/blog/img/sense2vec.jpg | Bin 0 -> 289956 bytes website/blog/img/sense2vec_large.jpg | Bin 0 -> 536346 bytes website/blog/img/sense2vec_small.jpg | Bin 0 -> 76683 bytes website/blog/index.jade | 31 + website/blog/introducing-spacy.jade | 18 + website/blog/modular-markup.jade | 147 + website/blog/parsing-english-in-python.jade | 575 ++++ .../part-of-speech-pos-tagger-in-python.jade | 257 ++ website/blog/sense2vec-with-spacy.jade | 166 + website/blog/spacy-now-mit.jade | 50 + website/blog/writing-c-in-cython.jade | 96 + website/create_code_samples | 76 - website/demos/_data.json | 6 + website/demos/img/displacy.jpg | Bin 0 -> 125511 bytes website/demos/img/sense2vec.jpg | Bin 0 -> 158352 bytes website/demos/index.jade | 15 + website/docs/_annotation-specs.jade | 170 + website/docs/_api-doc.jade | 313 ++ website/docs/_api-english.jade | 273 ++ website/docs/_api-lexeme.jade | 204 ++ website/docs/_api-matcher.jade | 82 + website/docs/_api-span.jade | 307 ++ website/docs/_api-stringstore.jade | 105 + website/docs/_api-token.jade | 317 ++ website/docs/_api-vocab.jade | 157 + website/docs/_data.json | 29 + website/docs/_quickstart-examples.jade | 173 + website/docs/_quickstart-install.jade | 119 + website/docs/_tutorials.jade | 10 + website/docs/index.jade | 37 + website/docs/legacy/index.html | 933 ++++++ website/docs/legacy/resources/css/style.css | 1 + .../resources/fonts/inconsolata-bold.eot | Bin 0 -> 23547 bytes .../resources/fonts/inconsolata-bold.svg | 230 ++ .../resources/fonts/inconsolata-bold.ttf | Bin 0 -> 49932 bytes .../resources/fonts/inconsolata-bold.woff | Bin 0 -> 26296 bytes .../resources/fonts/inconsolata-bold.woff2 | Bin 0 -> 21452 bytes .../resources/fonts/inconsolata-regular.eot | Bin 0 -> 37211 bytes .../resources/fonts/inconsolata-regular.svg | 229 ++ .../resources/fonts/inconsolata-regular.ttf | Bin 0 -> 65668 bytes .../resources/fonts/inconsolata-regular.woff | Bin 0 -> 41320 bytes .../resources/fonts/inconsolata-regular.woff2 | Bin 0 -> 33932 bytes .../legacy/resources/fonts/karla-bold.eot | Bin 0 -> 8768 bytes 
.../legacy/resources/fonts/karla-bold.svg | 346 ++ .../legacy/resources/fonts/karla-bold.ttf | Bin 0 -> 17024 bytes .../legacy/resources/fonts/karla-bold.woff | Bin 0 -> 10804 bytes .../legacy/resources/fonts/karla-bold.woff2 | Bin 0 -> 8032 bytes .../resources/fonts/karla-bolditalic.eot | Bin 0 -> 9856 bytes .../resources/fonts/karla-bolditalic.svg | 351 +++ .../resources/fonts/karla-bolditalic.ttf | Bin 0 -> 17692 bytes .../resources/fonts/karla-bolditalic.woff | Bin 0 -> 11852 bytes .../resources/fonts/karla-bolditalic.woff2 | Bin 0 -> 9008 bytes .../legacy/resources/fonts/karla-italic.eot | Bin 0 -> 8970 bytes .../legacy/resources/fonts/karla-italic.svg | 351 +++ .../legacy/resources/fonts/karla-italic.ttf | Bin 0 -> 17544 bytes .../legacy/resources/fonts/karla-italic.woff | Bin 0 -> 11004 bytes .../legacy/resources/fonts/karla-italic.woff2 | Bin 0 -> 8188 bytes .../legacy/resources/fonts/karla-regular.eot | Bin 0 -> 8392 bytes .../legacy/resources/fonts/karla-regular.svg | 351 +++ .../legacy/resources/fonts/karla-regular.ttf | Bin 0 -> 16248 bytes .../legacy/resources/fonts/karla-regular.woff | Bin 0 -> 10396 bytes .../resources/fonts/karla-regular.woff2 | Bin 0 -> 7624 bytes website/docs/legacy/resources/img/logo.png | Bin 0 -> 1505 bytes website/docs/legacy/resources/img/logo.svg | 46 + website/docs/legacy/resources/js/prism.js | 1692 ++++++++++ website/docs/legacy/resources/js/prism.min.js | 2 + website/docs/tutorials/_data.json | 29 + .../docs/tutorials/load-new-word-vectors.jade | 66 + website/docs/tutorials/mark-adverbs.jade | 121 + website/docs/tutorials/syntax-search.jade | 76 + website/docs/tutorials/twitter-filter.jade | 154 + website/feed.xml.jade | 30 + website/index.jade | 186 ++ website/legal.jade | 77 + website/src/jade/404.jade | 14 - website/src/jade/blog/_agpl_license.jade | 116 - .../dead-code-should-be-buried/index.jade | 35 - .../blog/dead-code-should-be-buried/meta.jade | 16 - website/src/jade/blog/displacy/index.jade | 50 - website/src/jade/blog/displacy/meta.jade | 12 - .../eli5-computers-learn-reading/index.jade | 37 - .../eli5-computers-learn-reading/meta.jade | 15 - .../src/jade/blog/how-spacy-works/index.jade | 154 - .../src/jade/blog/how-spacy-works/meta.jade | 7 - website/src/jade/blog/index.jade | 58 - .../jade/blog/introducing-spacy/index.jade | 22 - .../src/jade/blog/introducing-spacy/meta.jade | 15 - .../blog/parsing-english-in-python/index.jade | 621 ---- .../blog/parsing-english-in-python/meta.jade | 15 - .../index.jade | 288 -- .../meta.jade | 12 - .../src/jade/blog/sense2vec-reddit/index.jade | 131 - .../src/jade/blog/spacy-now-mit/index.jade | 56 - website/src/jade/blog/spacy-now-mit/meta.jade | 8 - .../jade/blog/writing-c-in-cython/index.jade | 100 - .../jade/blog/writing-c-in-cython/meta.jade | 15 - website/src/jade/docs/_api.jade | 658 ---- website/src/jade/docs/_spec.jade | 113 - website/src/jade/docs/index.jade | 28 - website/src/jade/header.jade | 196 -- website/src/jade/home/_comparisons.jade | 150 - website/src/jade/home/_installation.jade | 251 -- website/src/jade/home/_online_demo.jade | 22 - website/src/jade/home/_usage.jade | 65 - website/src/jade/home/index.jade | 38 - website/src/jade/mixins.jade | 32 - website/src/jade/tutorials/_teaser.jade | 16 - .../jade/tutorials/add-a-language/index.jade | 136 - .../jade/tutorials/add-a-language/meta.jade | 8 - .../bootstrap-ner-word2vec/index.jade | 253 -- .../src/jade/tutorials/customizing-spacy.jade | 5 - .../src/jade/tutorials/dan-text-class.jade | 210 -- 
.../load-new-word-vectors/index.jade | 75 - .../tutorials/load-new-word-vectors/meta.jade | 8 - .../jade/tutorials/mark-adverbs/index.jade | 132 - .../src/jade/tutorials/mark-adverbs/meta.jade | 8 - website/src/jade/tutorials/multilingual.jade | 29 - .../src/jade/tutorials/set-lexeme-attrs.jade | 19 - .../jade/tutorials/syntax-search/index.jade | 91 - .../jade/tutorials/syntax-search/meta.jade | 8 - .../jade/tutorials/twitter-filter/index.jade | 167 - .../jade/tutorials/twitter-filter/meta.jade | 8 - website/styleguide.jade | 391 +++ website/team.jade | 19 + 294 files changed, 30210 insertions(+), 4620 deletions(-) create mode 100644 website/404.jade create mode 100644 website/_data.json create mode 100644 website/_fabfile.py create mode 100644 website/_harp.json create mode 100644 website/_includes/_analytics.jade create mode 100644 website/_includes/_article.jade create mode 100644 website/_includes/_footer.jade create mode 100644 website/_includes/_functions.jade create mode 100644 website/_includes/_head.jade create mode 100644 website/_includes/_header.jade create mode 100644 website/_includes/_latest-posts.jade create mode 100644 website/_includes/_logo.jade create mode 100644 website/_includes/_mixins.jade create mode 100644 website/_includes/_nav.jade create mode 100644 website/_includes/_newsletter.jade create mode 100644 website/_includes/_profile.jade create mode 100644 website/_includes/_sidebar.jade create mode 100644 website/_includes/_teaser.jade create mode 100644 website/_layout.jade create mode 100644 website/assets/css/_base/_animations.sass create mode 100644 website/assets/css/_base/_fonts.sass create mode 100644 website/assets/css/_base/_grid.sass create mode 100644 website/assets/css/_base/_reset.sass create mode 100644 website/assets/css/_base/_typography.sass create mode 100644 website/assets/css/_components/_alerts.sass create mode 100644 website/assets/css/_components/_asides.sass create mode 100644 website/assets/css/_components/_boxes.sass create mode 100644 website/assets/css/_components/_buttons.sass create mode 100644 website/assets/css/_components/_cards.sass create mode 100644 website/assets/css/_components/_code.sass create mode 100644 website/assets/css/_components/_dividers.sass create mode 100644 website/assets/css/_components/_embeds.sass create mode 100644 website/assets/css/_components/_forms.sass create mode 100644 website/assets/css/_components/_icons.sass create mode 100644 website/assets/css/_components/_images.sass create mode 100644 website/assets/css/_components/_labels.sass create mode 100644 website/assets/css/_components/_links.sass create mode 100644 website/assets/css/_components/_lists.sass create mode 100644 website/assets/css/_components/_logo.sass create mode 100644 website/assets/css/_components/_misc.sass create mode 100644 website/assets/css/_components/_quotes.sass create mode 100644 website/assets/css/_components/_tables.sass create mode 100644 website/assets/css/_components/_tooltips.sass create mode 100644 website/assets/css/_layout/_article.sass create mode 100644 website/assets/css/_layout/_body.sass create mode 100644 website/assets/css/_layout/_footer.sass create mode 100644 website/assets/css/_layout/_header.sass create mode 100644 website/assets/css/_layout/_nav.sass create mode 100644 website/assets/css/_layout/_sidebar.sass create mode 100644 website/assets/css/_utils/_functions.sass create mode 100644 website/assets/css/_utils/_mixins.sass create mode 100644 website/assets/css/_variables.scss create mode 100644 
website/assets/css/_vendors/_displacy.sass create mode 100644 website/assets/css/_vendors/_normalize.sass create mode 100644 website/assets/css/_vendors/_prism.sass create mode 100644 website/assets/css/style.sass create mode 100644 website/assets/css/style_blog.sass create mode 100644 website/assets/fonts/icomoon.eot create mode 100644 website/assets/fonts/icomoon.svg create mode 100644 website/assets/fonts/icomoon.ttf create mode 100644 website/assets/fonts/icomoon.woff create mode 100755 website/assets/fonts/lato-bold.eot create mode 100755 website/assets/fonts/lato-bold.svg create mode 100755 website/assets/fonts/lato-bold.ttf create mode 100755 website/assets/fonts/lato-bold.woff create mode 100755 website/assets/fonts/lato-bold.woff2 create mode 100755 website/assets/fonts/lato-bolditalic.eot create mode 100755 website/assets/fonts/lato-bolditalic.svg create mode 100755 website/assets/fonts/lato-bolditalic.ttf create mode 100755 website/assets/fonts/lato-bolditalic.woff create mode 100755 website/assets/fonts/lato-bolditalic.woff2 create mode 100755 website/assets/fonts/lato-italic.eot create mode 100755 website/assets/fonts/lato-italic.svg create mode 100755 website/assets/fonts/lato-italic.ttf create mode 100755 website/assets/fonts/lato-italic.woff create mode 100755 website/assets/fonts/lato-italic.woff2 create mode 100755 website/assets/fonts/lato-regular.eot create mode 100755 website/assets/fonts/lato-regular.svg create mode 100755 website/assets/fonts/lato-regular.ttf create mode 100755 website/assets/fonts/lato-regular.woff create mode 100755 website/assets/fonts/lato-regular.woff2 create mode 100644 website/assets/fonts/sourcecodepro-semibold.eot create mode 100644 website/assets/fonts/sourcecodepro-semibold.svg create mode 100644 website/assets/fonts/sourcecodepro-semibold.ttf create mode 100644 website/assets/fonts/sourcecodepro-semibold.woff create mode 100755 website/assets/fonts/worksans-bold.eot create mode 100755 website/assets/fonts/worksans-bold.svg create mode 100755 website/assets/fonts/worksans-bold.ttf create mode 100755 website/assets/fonts/worksans-bold.woff create mode 100755 website/assets/fonts/worksans-bold.woff2 create mode 100755 website/assets/fonts/worksans-regular.eot create mode 100755 website/assets/fonts/worksans-regular.svg create mode 100755 website/assets/fonts/worksans-regular.ttf create mode 100755 website/assets/fonts/worksans-regular.woff create mode 100755 website/assets/fonts/worksans-regular.woff2 create mode 100755 website/assets/fonts/worksans-semibold.eot create mode 100755 website/assets/fonts/worksans-semibold.svg create mode 100755 website/assets/fonts/worksans-semibold.ttf create mode 100755 website/assets/fonts/worksans-semibold.woff create mode 100755 website/assets/fonts/worksans-semibold.woff2 create mode 100644 website/assets/img/favicon.ico create mode 100644 website/assets/img/logo.png create mode 100644 website/assets/img/logo.svg create mode 100644 website/assets/img/logos/chartbeat.png create mode 100644 website/assets/img/logos/cytora.png create mode 100644 website/assets/img/logos/keyreply.png create mode 100644 website/assets/img/logos/kip.png create mode 100644 website/assets/img/logos/signaln.png create mode 100644 website/assets/img/logos/socrata.png create mode 100644 website/assets/img/pattern_blue.jpg create mode 100644 website/assets/img/pattern_red.jpg create mode 100644 website/assets/img/profile_elmar.png create mode 100644 website/assets/img/profile_henning.png create mode 100644 
website/assets/img/profile_ines.png create mode 100644 website/assets/img/profile_ines_alt.png create mode 100644 website/assets/img/profile_matt.png create mode 100644 website/assets/img/profile_matt_alt.png create mode 100644 website/assets/img/profile_placeholder.png create mode 100644 website/assets/img/profile_wolfgang.png create mode 100644 website/assets/img/social.png create mode 100644 website/assets/img/spacy_screen.png create mode 100644 website/assets/js/main.js create mode 100644 website/assets/js/prism.js create mode 100644 website/blog/_data.json create mode 100644 website/blog/dead-code-should-be-buried.jade create mode 100644 website/blog/displacy.jade create mode 100644 website/blog/displacy/pizza-with-anchovies-bad.html create mode 100644 website/blog/displacy/pizza-with-anchovies-good.html create mode 100644 website/blog/displacy/robots-in-popular-culture.html create mode 100644 website/blog/eli5-computers-learn-reading.jade create mode 100644 website/blog/how-spacy-works.jade create mode 100644 website/blog/img/agpl-not-free.jpg create mode 100644 website/blog/img/agpl-not-free_large.jpg create mode 100644 website/blog/img/agpl-not-free_small.jpg create mode 100644 website/blog/img/anchovies.png create mode 100644 website/blog/img/basic-english.jpg create mode 100644 website/blog/img/basic-english_large.jpg create mode 100644 website/blog/img/basic-english_small.jpg create mode 100644 website/blog/img/cython.jpg create mode 100644 website/blog/img/cython_large.jpg create mode 100644 website/blog/img/cython_small.jpg create mode 100644 website/blog/img/deadcode.jpg create mode 100644 website/blog/img/deadcode_large.jpg create mode 100644 website/blog/img/deadcode_small.jpg create mode 100644 website/blog/img/displacy.jpg create mode 100644 website/blog/img/displacy_large.jpg create mode 100644 website/blog/img/displacy_small.jpg create mode 100644 website/blog/img/how-spacy-works.jpg create mode 100644 website/blog/img/how-spacy-works_large.jpg create mode 100644 website/blog/img/how-spacy-works_small.jpg create mode 100644 website/blog/img/introducing-spacy.jpg create mode 100644 website/blog/img/introducing-spacy_large.jpg create mode 100644 website/blog/img/introducing-spacy_small.jpg create mode 100644 website/blog/img/linguistic-structure.jpg create mode 100644 website/blog/img/markup.jpg create mode 100644 website/blog/img/markup_basscss.jpg create mode 100644 website/blog/img/markup_bootstrap.jpg create mode 100644 website/blog/img/markup_docs.jpg create mode 100644 website/blog/img/markup_large.jpg create mode 100644 website/blog/img/markup_mixins.jpg create mode 100644 website/blog/img/markup_sections.jpg create mode 100644 website/blog/img/markup_small.jpg create mode 100644 website/blog/img/markup_workflow.jpg create mode 100644 website/blog/img/pizza.jpg create mode 100644 website/blog/img/pizza_large.jpg create mode 100644 website/blog/img/pizza_small.jpg create mode 100644 website/blog/img/pos-tagger.jpg create mode 100644 website/blog/img/pos-tagger_large.jpg create mode 100644 website/blog/img/pos-tagger_small.jpg create mode 100644 website/blog/img/sense2vec.jpg create mode 100644 website/blog/img/sense2vec_large.jpg create mode 100644 website/blog/img/sense2vec_small.jpg create mode 100644 website/blog/index.jade create mode 100644 website/blog/introducing-spacy.jade create mode 100644 website/blog/modular-markup.jade create mode 100644 website/blog/parsing-english-in-python.jade create mode 100644 website/blog/part-of-speech-pos-tagger-in-python.jade 
create mode 100644 website/blog/sense2vec-with-spacy.jade create mode 100644 website/blog/spacy-now-mit.jade create mode 100644 website/blog/writing-c-in-cython.jade delete mode 100755 website/create_code_samples create mode 100644 website/demos/_data.json create mode 100644 website/demos/img/displacy.jpg create mode 100644 website/demos/img/sense2vec.jpg create mode 100644 website/demos/index.jade create mode 100644 website/docs/_annotation-specs.jade create mode 100644 website/docs/_api-doc.jade create mode 100644 website/docs/_api-english.jade create mode 100644 website/docs/_api-lexeme.jade create mode 100644 website/docs/_api-matcher.jade create mode 100644 website/docs/_api-span.jade create mode 100644 website/docs/_api-stringstore.jade create mode 100644 website/docs/_api-token.jade create mode 100644 website/docs/_api-vocab.jade create mode 100644 website/docs/_data.json create mode 100644 website/docs/_quickstart-examples.jade create mode 100644 website/docs/_quickstart-install.jade create mode 100644 website/docs/_tutorials.jade create mode 100644 website/docs/index.jade create mode 100644 website/docs/legacy/index.html create mode 100755 website/docs/legacy/resources/css/style.css create mode 100644 website/docs/legacy/resources/fonts/inconsolata-bold.eot create mode 100644 website/docs/legacy/resources/fonts/inconsolata-bold.svg create mode 100644 website/docs/legacy/resources/fonts/inconsolata-bold.ttf create mode 100644 website/docs/legacy/resources/fonts/inconsolata-bold.woff create mode 100644 website/docs/legacy/resources/fonts/inconsolata-bold.woff2 create mode 100644 website/docs/legacy/resources/fonts/inconsolata-regular.eot create mode 100644 website/docs/legacy/resources/fonts/inconsolata-regular.svg create mode 100644 website/docs/legacy/resources/fonts/inconsolata-regular.ttf create mode 100644 website/docs/legacy/resources/fonts/inconsolata-regular.woff create mode 100644 website/docs/legacy/resources/fonts/inconsolata-regular.woff2 create mode 100644 website/docs/legacy/resources/fonts/karla-bold.eot create mode 100644 website/docs/legacy/resources/fonts/karla-bold.svg create mode 100644 website/docs/legacy/resources/fonts/karla-bold.ttf create mode 100644 website/docs/legacy/resources/fonts/karla-bold.woff create mode 100644 website/docs/legacy/resources/fonts/karla-bold.woff2 create mode 100644 website/docs/legacy/resources/fonts/karla-bolditalic.eot create mode 100644 website/docs/legacy/resources/fonts/karla-bolditalic.svg create mode 100644 website/docs/legacy/resources/fonts/karla-bolditalic.ttf create mode 100644 website/docs/legacy/resources/fonts/karla-bolditalic.woff create mode 100644 website/docs/legacy/resources/fonts/karla-bolditalic.woff2 create mode 100644 website/docs/legacy/resources/fonts/karla-italic.eot create mode 100644 website/docs/legacy/resources/fonts/karla-italic.svg create mode 100644 website/docs/legacy/resources/fonts/karla-italic.ttf create mode 100644 website/docs/legacy/resources/fonts/karla-italic.woff create mode 100644 website/docs/legacy/resources/fonts/karla-italic.woff2 create mode 100644 website/docs/legacy/resources/fonts/karla-regular.eot create mode 100644 website/docs/legacy/resources/fonts/karla-regular.svg create mode 100644 website/docs/legacy/resources/fonts/karla-regular.ttf create mode 100644 website/docs/legacy/resources/fonts/karla-regular.woff create mode 100644 website/docs/legacy/resources/fonts/karla-regular.woff2 create mode 100644 website/docs/legacy/resources/img/logo.png create mode 100644 
website/docs/legacy/resources/img/logo.svg create mode 100644 website/docs/legacy/resources/js/prism.js create mode 100644 website/docs/legacy/resources/js/prism.min.js create mode 100644 website/docs/tutorials/_data.json create mode 100644 website/docs/tutorials/load-new-word-vectors.jade create mode 100644 website/docs/tutorials/mark-adverbs.jade create mode 100644 website/docs/tutorials/syntax-search.jade create mode 100644 website/docs/tutorials/twitter-filter.jade create mode 100644 website/feed.xml.jade create mode 100644 website/index.jade create mode 100644 website/legal.jade delete mode 100644 website/src/jade/404.jade delete mode 100644 website/src/jade/blog/_agpl_license.jade delete mode 100644 website/src/jade/blog/dead-code-should-be-buried/index.jade delete mode 100644 website/src/jade/blog/dead-code-should-be-buried/meta.jade delete mode 100644 website/src/jade/blog/displacy/index.jade delete mode 100644 website/src/jade/blog/displacy/meta.jade delete mode 100644 website/src/jade/blog/eli5-computers-learn-reading/index.jade delete mode 100644 website/src/jade/blog/eli5-computers-learn-reading/meta.jade delete mode 100644 website/src/jade/blog/how-spacy-works/index.jade delete mode 100644 website/src/jade/blog/how-spacy-works/meta.jade delete mode 100644 website/src/jade/blog/index.jade delete mode 100644 website/src/jade/blog/introducing-spacy/index.jade delete mode 100644 website/src/jade/blog/introducing-spacy/meta.jade delete mode 100644 website/src/jade/blog/parsing-english-in-python/index.jade delete mode 100644 website/src/jade/blog/parsing-english-in-python/meta.jade delete mode 100644 website/src/jade/blog/part-of-speech-POS-tagger-in-python/index.jade delete mode 100644 website/src/jade/blog/part-of-speech-POS-tagger-in-python/meta.jade delete mode 100644 website/src/jade/blog/sense2vec-reddit/index.jade delete mode 100644 website/src/jade/blog/spacy-now-mit/index.jade delete mode 100644 website/src/jade/blog/spacy-now-mit/meta.jade delete mode 100644 website/src/jade/blog/writing-c-in-cython/index.jade delete mode 100644 website/src/jade/blog/writing-c-in-cython/meta.jade delete mode 100644 website/src/jade/docs/_api.jade delete mode 100644 website/src/jade/docs/_spec.jade delete mode 100644 website/src/jade/docs/index.jade delete mode 100644 website/src/jade/header.jade delete mode 100644 website/src/jade/home/_comparisons.jade delete mode 100644 website/src/jade/home/_installation.jade delete mode 100644 website/src/jade/home/_online_demo.jade delete mode 100644 website/src/jade/home/_usage.jade delete mode 100644 website/src/jade/home/index.jade delete mode 100644 website/src/jade/mixins.jade delete mode 100644 website/src/jade/tutorials/_teaser.jade delete mode 100644 website/src/jade/tutorials/add-a-language/index.jade delete mode 100644 website/src/jade/tutorials/add-a-language/meta.jade delete mode 100644 website/src/jade/tutorials/bootstrap-ner-word2vec/index.jade delete mode 100644 website/src/jade/tutorials/customizing-spacy.jade delete mode 100644 website/src/jade/tutorials/dan-text-class.jade delete mode 100644 website/src/jade/tutorials/load-new-word-vectors/index.jade delete mode 100644 website/src/jade/tutorials/load-new-word-vectors/meta.jade delete mode 100644 website/src/jade/tutorials/mark-adverbs/index.jade delete mode 100644 website/src/jade/tutorials/mark-adverbs/meta.jade delete mode 100644 website/src/jade/tutorials/multilingual.jade delete mode 100644 website/src/jade/tutorials/set-lexeme-attrs.jade delete mode 100644 
website/src/jade/tutorials/syntax-search/index.jade delete mode 100644 website/src/jade/tutorials/syntax-search/meta.jade delete mode 100644 website/src/jade/tutorials/twitter-filter/index.jade delete mode 100644 website/src/jade/tutorials/twitter-filter/meta.jade create mode 100644 website/styleguide.jade create mode 100644 website/team.jade diff --git a/website/404.jade b/website/404.jade new file mode 100644 index 000000000..9ea96e237 --- /dev/null +++ b/website/404.jade @@ -0,0 +1,10 @@ +include _includes/_mixins + +//- 404 Error +//- ============================================================================ + ++lead.text-center Ooops, this page does not exist. Click #[a(href='javascript:history.go(-1)') here] to go back or check out one of the latest posts below. + ++divider('bottom') + +!=partial('_includes/_latest-posts', { max: 3 } ) diff --git a/website/README.md b/website/README.md index 928c9591e..b0db3f3ca 100644 --- a/website/README.md +++ b/website/README.md @@ -1,29 +1,22 @@ -Source for spacy.io -============================== +# Source files for the spacy.io website and docs -This directory contains the source for official spaCy website at http://spacy.io/. +The [spacy.io](https://spacy.io) website is implemented in [Jade (aka Pug)](https://www.jade-lang.org), and is built or served by [Harp](https://harpjs.com). -Fixes, updates and suggestions are welcome. +## Building the site +To build the site and start making changes: -Releases --------- -Changes made to this directory go live on spacy.io. + sudo npm install --global harp + git clone https://github.com/spacy-io/website + cd website + harp server +This will serve the site on [http://localhost:9000](http://localhost:9000). You can then edit the jade source and refresh the page to see your changes. -The Stack --------- -The site is built with the [Jade](http://jade-lang.com/) template language. +## Reading the source -See [fabfile.py](/fabfile.py) under ```web()``` for more +Jade is an extensible templating language with a readable syntax, that compiles to HTML. +The website source makes extensive use of Jade mixins, so that the design system is abstracted away from the content you're +writing. You can read more about our approach in our blog post, ["Rebuilding a Site with Modular Markup"](https://spacy.io/blog/modular-markup). - -Developing --------- -To make and test changes -``` - npm install jade --global - fab web - cd website/site; python -m SimpleHTTPServer 8000; cd - -``` -Then visit [localhost:8000](http://localhost:8000) +If you want to write or edit the pages, the site's [styleguide](http://spacy.io/styleguide) serves as a useful reference of the available mixins. 
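The deploy half of the workflow described in the README above lives in website/_fabfile.py, a few hunks below: compile() wipes www/ and runs 'NODE_ENV=s3 harp compile', and publish() walks the compiled output, md5-hashes each file and skips any S3 key whose ETag already matches, so repeat deploys only upload what changed. A reduced sketch of that comparison (assuming plain, non-multipart S3 uploads, where the ETag is the quoted hex MD5 of the object body):

import hashlib

def needs_upload(local_path, remote_etag):
    # For simple (non-multipart) S3 PUTs the ETag is the MD5 digest of the
    # object, wrapped in quotes; strip them and compare to the local file.
    with open(local_path, 'rb') as f:
        checksum = hashlib.md5(f.read()).hexdigest()
    return remote_etag is None or remote_etag.strip('"') != checksum

# e.g. needs_upload('www/index.html', key.etag) -> False when unchanged
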
diff --git a/website/_data.json b/website/_data.json new file mode 100644 index 000000000..5a7c17ff7 --- /dev/null +++ b/website/_data.json @@ -0,0 +1,51 @@ +{ + "index": { + "landing": true + }, + + "feed": { + "layout": false + }, + + "robots": { + "layout": false + }, + + "404": { + "title": "404 Error", + "asides": false + }, + + "team": { + "title": "Team" + }, + + "legal": { + "title": "Legal & Imprint", + "sidebar": true, + "asides": true + }, + + "styleguide": { + "title" : "Styleguide", + "standalone" : true, + "asides": true, + + "sidebar": { + "About": [ + ["Introduction", "#section-introduction", "introduction"] + ], + "Design": [ + ["Colors", "#section-colors", "colors"], + ["Logo", "#section-logo", "logo"], + ["Typography", "#section-typography", "typography"], + ["Grid", "#section-grid", "grid"], + ["Elements", "#section-elements", "elements"], + ["Components", "#section-components", "components"] + ], + "Code": [ + ["Source", "#section-source", "source"] + ] + } + } +} diff --git a/website/_fabfile.py b/website/_fabfile.py new file mode 100644 index 000000000..d31ea67eb --- /dev/null +++ b/website/_fabfile.py @@ -0,0 +1,94 @@ +from __future__ import print_function + +from fabric.api import local +import os +import hashlib +import mimetypes +import shutil + +import boto.s3.connection + + +mimetypes.init() + +buckets = { + 'staging': 'staging.spacy.io', + 'production': 'spacy.io', +} + + +def compile(): + shutil.rmtree('www') + local('NODE_ENV=s3 harp compile') + + +def publish(env='staging', site_path='www'): + os.environ['S3_USE_SIGV4'] = 'True' + conn = boto.s3.connection.S3Connection(host='s3.eu-central-1.amazonaws.com', + calling_format=boto.s3.connection.OrdinaryCallingFormat()) + bucket = conn.get_bucket(buckets[env], validate=False) + + keys = {k.name: k for k in bucket.list()} + keys_left = set(keys) + + for root, dirnames, filenames in os.walk(site_path): + for dirname in dirnames: + target = os.path.relpath(os.path.join(root, dirname), site_path) + source = os.path.join(target, 'index.html') + + if os.path.exists(os.path.join(root, dirname, 'index.html')): + redirect = '//%s/%s' % (bucket.name, target) + key = bucket.lookup(source) + if not key: + key = bucket.new_key(source) + key.set_redirect(redirect) + print('setting redirect for %s' % target) + elif key.get_redirect() != redirect: + key.set_redirect(redirect) + print('setting redirect for %s' % target) + + if source in keys_left: + keys_left.remove(source) + + for filename in filenames: + source = os.path.join(root, filename) + + if filename == 'index.html': + target = os.path.normpath(os.path.relpath(root, site_path)) + if target == '.': + target = filename + else: + target = os.path.normpath(os.path.join(os.path.relpath(root, site_path), filename)) + if target.endswith('.html'): + target = target[:-len('.html')] + + content_type = mimetypes.guess_type(source)[0] + cache_control = 'no-transform,public,max-age=300,s-maxage=300' + checksum = hashlib.md5(open(source).read()).hexdigest() + + if (target not in keys + or keys[target].etag.replace('"', '') != checksum): + + key = bucket.new_key(target) + if content_type: + key.content_type = content_type + key.set_contents_from_filename(source, + headers={'Cache-Control': cache_control}) + print('uploading %s' % target) + + elif content_type: + key = bucket.lookup(target) + if (key + and (key.content_type != content_type + or key.cache_control != cache_control)): + key.copy(key.bucket, key.name, preserve_acl=True, + metadata={'Content-Type': content_type, + 
'Cache-Control': cache_control}) + print('update headers %s' % target) + + if target in keys_left: + keys_left.remove(target) + + for key_name in keys_left: + print('deleting %s' % key_name) + bucket.delete_key(key_name) diff --git a/website/_harp.json b/website/_harp.json new file mode 100644 index 000000000..6510f73cf --- /dev/null +++ b/website/_harp.json @@ -0,0 +1,85 @@ +{ + "globals": { + "title": "spaCy.io", + "sitename": "spaCy", + "slogan": "Industrial-strength Natural Language Processing", + "description": "spaCy is a free open-source library featuring state-of-the-art speed and accuracy and a powerful Python API.", + "url": "https://spacy.io", + "email": "contact@spacy.io", + "company": "spaCy GmbH", + "team_members": [ "henning", "matt", "wolfgang", "elmar", "ines" ], + + "navigation": { "Docs": "docs", "Demos": "demos", "Team": "team", "Blog": "blog" }, + "profiles": { "twitter": "spacy_io", "github": "spacy-io", "reddit": "spacynlp", "medium": "spacy" }, + "google_analytics": "UA-58931649-1", + + "stylesheets": { "default": "style", "blog": "style_blog" }, + "scripts" : [ "main", "prism" ], + "feed": "feed.xml", + "image_sizes" : { "small" : "640", "medium": "1440", "large": "2000" }, + "default_syntax" : "python", + + "spacy_version": "0.100.6", + "spacy_stars": "1500", + "github_settings": { "user": "spacy-io", "repo": "spacy" }, + + "apis": { + "displacy": "https://displacy.spacy.io/", + "sense2vec": "https://sense2vec.spacy.io/api/similarity/reddit/" + }, + + "authors" : { + "matt" : { + "name" : "Matthew Honnibal", + "title": "CTO", + "description" : "is co-founder and CTO of spaCy. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to write spaCy. He's from Sydney and lives in Berlin.", + "links": { + "twitter": [ "https://twitter.com/honnibal", "Twitter" ], + "website": [ "https://www.semanticscholar.org/search?q=Matthew%20Honnibal", "Semantic Scholar" ] + } + }, + + "henning": { + "name": "Henning Peters", + "title": "CEO", + "description": "is co-founder and CEO of spaCy. He holds a MSc in computer science and has been co-founder and CTO of Skoobe and Absolventa. His passions are uncommon languages and backcountry skiing.", + "links": { + "twitter": [ "https://twitter.com/henningpeters", "Twitter"], + "linkedin": [ "https://de.linkedin.com/in/hepeters", "LinkedIn"], + "github": [ "https://github.com/henningpeters", "GitHub"] + } + }, + + "ines": { + "name": "Ines Montani", + "title": "Front-End", + "description": "As Head of Front-End, Ines is in charge of showing people what spaCy can do. She develops, designs and implements our interactive demos and the spacy.io website. Ines has a degree in media, linguistics and communications, and over ten years experience in web development.", + "links": { + "twitter": [ "https://twitter.com/_inesmontani", "Twitter" ], + "codepen": [ "https://codepen.io/inesmontani", "Codepen"], + "github": [ "https://github.com/inesmontani", "GitHub"], + "website": [ "http://ines.io", "Blog" ] + } + }, + + "wolfgang": { + "name": "Wolfgang Seeker", + "title": "NLP Engineer", + "description": "is a computational linguist from Germany. He is fascinated with the complexity and variety of human language, and spent his PhD looking for ways to make NLP work well with any kind of language in the world. 
He joined spaCy to build effective and truly multilingual NLP software.", + "links": { + "website": [ "https://www.semanticscholar.org/search?q=Wolfgang%20Seeker", "Semantic Scholar" ] + } + }, + + "elmar": { + "name": "Elmar Haußmann", + "title": "NLP Engineer", + "description": "is an NLP engineer at spaCy, passionate about deep learning. He has a background in both, academic research, with a PhD in computer science, and industry, as a former consultant and software engineer at IBM. Originally from Stuttgart, the avid snowboarder and mountain biker doesn't only ride powder and trails but also covers distances via plane between the spaCy office in Berlin and his new home in Beijing.", + "links": { + "github": [ "https://github.com/elmar-haussmann", "GitHub"], + "twitter": [ "https://twitter.com/elhaussmann", "Twitter" ] + } + } + } + } +} diff --git a/website/_includes/_analytics.jade b/website/_includes/_analytics.jade new file mode 100644 index 000000000..ab322b800 --- /dev/null +++ b/website/_includes/_analytics.jade @@ -0,0 +1,7 @@ +if environment != 'development' + script. + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', '#{google_analytics}', 'auto'); ga('send', 'pageview'); diff --git a/website/_includes/_article.jade b/website/_includes/_article.jade new file mode 100644 index 000000000..e39bd3649 --- /dev/null +++ b/website/_includes/_article.jade @@ -0,0 +1,34 @@ +include ../_includes/_mixins + +//- Article +//- ============================================================================ + +article.article(id=current.source) + + header.article-header + +h2.article-title=title + .article-meta + if author + | by #[a.link(href=(authors[author].url || url) target='_blank')=authors[author].name] on + | #[+date(date)] + + .article-body!=yield + + footer.article-footer + + +grid('padding', 'align-right', 'valign-center') + +tweet(title) + + if links + for link, index in links + div: +button('primary', 'small', index.toLowerCase())(href=link target='_blank') + +icon(index.toLowerCase(), 'medium', 'secondary') + | Discussion on #{index} + + if author + +divider + + !=partial('_profile', { label: 'About the Author', style: 'alt' }) + +!=partial('_newsletter', { divider: 'both' }) +!=partial('_latest-posts', { max: 2, _section: _section } ) diff --git a/website/_includes/_footer.jade b/website/_includes/_footer.jade new file mode 100644 index 000000000..017cdfa86 --- /dev/null +++ b/website/_includes/_footer.jade @@ -0,0 +1,14 @@ +include _mixins + +//- Footer +//- ============================================================================ + +footer.footer + span © #{new Date().getFullYear()} #{company} + a(href='/legal') Legal / Imprint + + a(href='https://twitter.com/' + profiles.twitter target='_blank' aria-label="Twitter") + +icon('twitter', 'secondary') + + a(href='/feed.xml' target='_blank' aria-label="RSS Feed") + +icon('feed', 'secondary') diff --git a/website/_includes/_functions.jade b/website/_includes/_functions.jade new file mode 100644 index 000000000..b58eea883 --- /dev/null +++ b/website/_includes/_functions.jade @@ -0,0 +1,101 @@ +//- Functions +//- ============================================================================ + +//- Full page title + +- function getPageTitle() { +- 
if(current.path[0] == 'blog' && current.source != 'index') title += ' | Blog'; +- return (current.path[0] == 'index') ? sitename + ' | ' + slogan : title + ' | ' + sitename; +- } + + +//- Get current URL + current - [string] current path + +- function getCurrentUrl() { +- var base = current.path; +- if(current.source == 'index') base.pop(); +- return url + '/' + base.join('/'); +- } + + +//- Assign flexbox order, elements are assigned negative values to always move + them to the start of a flexbox in the correct order (i.e. -3, -2, -1) + counter - [integer] index of current item + max - [integer] amount of items in total + start - [integer] index of start position, i.e. 0 -> oder: -1 (optional) + +- function assignOrder(counter, max, start) { +- if(counter >= 0 && counter < max) return "order: -" + (max - counter + (start || 0)); +- } + + +//- Create Twitter share URL + current - [string] current path + tweet - [string] text to be shared with link + +- function twitterShareUrl(current, tweet) { +- return "https://twitter.com/share?text=" + tweet + "&url=" + getCurrentUrl(current) + ";via=" + profiles.twitter; +- } + + +//- Add prefix to each item in an array (used for modifier CSS classes) + array - [array] array of strings, taken from mixin arguments + prefix - [string] class prefix (i.e. 'button--') + +- function prefixArgs(array, prefix) { +- for(var i = 0; i < array.length; i++) { +- array[i] = prefix + array[i]; +- } +- return array.join(' '); +- } + + +//- Convert date to human readable and timestamp format + input - [string] date in the format YYYY-MM-DD + +- function convertDate(input) { +- var dates = []; +- var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ]; +- var date = new Date(input); +- dates.full = months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear(); +- dates.timestamp = JSON.parse(JSON.stringify(date)); +- return dates; +- } + + +//- Convert date to valid RSS pubDate + input - [string] date in the format YYYY-MM-DD + +- function convertPubDate(input) { +- var date = new Date(input); +- var pieces = date.toString().split(' '); +- var offsetTime = pieces[5].match(/[-+]\d{4}/); +- var offset = (offsetTime) ? 
offsetTime : pieces[5]; +- var parts = [ pieces[0] + ',', pieces[2], pieces[1], pieces[3], pieces[4], offset ]; +- return parts.join(' '); +- } + + +//- Compile scrset attribute for hero images + image - [object] article image object from _data.json + path - [string] relative path to image folder + +- function getScrset(image, path) { +- var scrset = path + image.file + ' ' + image_sizes.medium + 'w'; +- if(image.file_small) scrset += ', ' + path + image.file_small + ' ' + image_sizes.small + 'w'; +- if(image.file_large) scrset += ', ' + path + image.file_large + ' ' + image_sizes.large + 'w'; +- return scrset; +- } + + +//- Get meta image + +- function getMetaImage() { +- if(current.path[0] == 'blog' && image && image.file) { +- return url + '/blog/img/' + image.file; +- } +- else { +- return url + '/assets/img/social.png'; +- } +- } diff --git a/website/_includes/_head.jade b/website/_includes/_head.jade new file mode 100644 index 000000000..96c4d0154 --- /dev/null +++ b/website/_includes/_head.jade @@ -0,0 +1,31 @@ +include _mixins + +- var is_blog = (_section == 'blog') + + +//- Head +//- ============================================================================ + +head + title=getPageTitle() + + meta(charset='utf-8') + meta(name="viewport" content="width=device-width, initial-scale=1.0") + meta(name='referrer' content='always') + + meta(property='og:type' content='website') + meta(property='og:site_name' content=sitename) + meta(property='og:url' content=getCurrentUrl()) + meta(property='og:title' content=title) + meta(property='og:description' content=description) + meta(property='og:image' content=getMetaImage()) + + meta(name='twitter:card' content='summary_large_image') + meta(name='twitter:site' content='@' + profiles.twitter) + meta(name='twitter:title' content=title) + meta(name='twitter:description' content=description) + meta(name='twitter:image' content=getMetaImage()) + + link(rel='icon' type='image/x-icon' href='/assets/img/favicon.ico') + link(href='/assets/css/' + ((is_blog) ? stylesheets.blog : stylesheets.default) + '.css' rel='stylesheet') + link(href='/' + feed rel='alternate' type='application/rss+xml' title='RSS') diff --git a/website/_includes/_header.jade b/website/_includes/_header.jade new file mode 100644 index 000000000..345073c71 --- /dev/null +++ b/website/_includes/_header.jade @@ -0,0 +1,21 @@ +include _mixins + +//- Header +//- ============================================================================ + +header.header(class=(image) ? 'hero' : '') + + if image + img(srcset=getScrset(image, 'img/') alt=image.alt sizes='100vw') + + if image.credit + .hero-credit + if image.url + a(href=image.url target='_blank')=image.credit + + else + !=image.credit + + else + if !is_article && headline != false + h1.header-title=title diff --git a/website/_includes/_latest-posts.jade b/website/_includes/_latest-posts.jade new file mode 100644 index 000000000..f4289866b --- /dev/null +++ b/website/_includes/_latest-posts.jade @@ -0,0 +1,17 @@ +include _mixins + +- var post_counter = 0 +- var is_docs = (_section == 'docs') + + +//- Latest Posts +//- ============================================================================ + ++grid('padding') + each post, slug in ( (_section == 'docs' ) ? public.docs.tutorials._data : public.blog._data) + if slug != 'index' && slug != current.source && post_counter < (max || 3) + + +grid-col('space-between', ((max > 2 && max % 3 == 0) ? 'third' : 'half')) + !=partial('_teaser', { teaser: post, slug: slug, _root: (is_docs) ? 
diff --git a/website/_includes/_head.jade b/website/_includes/_head.jade
new file mode 100644
index 000000000..96c4d0154
--- /dev/null
+++ b/website/_includes/_head.jade
@@ -0,0 +1,31 @@
+include _mixins
+
+- var is_blog = (_section == 'blog')
+
+
+//- Head
+//- ============================================================================
+
+head
+    title=getPageTitle()
+
+    meta(charset='utf-8')
+    meta(name="viewport" content="width=device-width, initial-scale=1.0")
+    meta(name='referrer' content='always')
+
+    meta(property='og:type' content='website')
+    meta(property='og:site_name' content=sitename)
+    meta(property='og:url' content=getCurrentUrl())
+    meta(property='og:title' content=title)
+    meta(property='og:description' content=description)
+    meta(property='og:image' content=getMetaImage())
+
+    meta(name='twitter:card' content='summary_large_image')
+    meta(name='twitter:site' content='@' + profiles.twitter)
+    meta(name='twitter:title' content=title)
+    meta(name='twitter:description' content=description)
+    meta(name='twitter:image' content=getMetaImage())
+
+    link(rel='icon' type='image/x-icon' href='/assets/img/favicon.ico')
+    link(href='/assets/css/' + ((is_blog) ? stylesheets.blog : stylesheets.default) + '.css' rel='stylesheet')
+    link(href='/' + feed rel='alternate' type='application/rss+xml' title='RSS')
diff --git a/website/_includes/_header.jade b/website/_includes/_header.jade
new file mode 100644
index 000000000..345073c71
--- /dev/null
+++ b/website/_includes/_header.jade
@@ -0,0 +1,21 @@
+include _mixins
+
+//- Header
+//- ============================================================================
+
+header.header(class=(image) ? 'hero' : '')
+
+    if image
+        img(srcset=getSrcset(image, 'img/') alt=image.alt sizes='100vw')
+
+        if image.credit
+            .hero-credit
+                if image.url
+                    a(href=image.url target='_blank')=image.credit
+
+                else
+                    !=image.credit
+
+    else
+        if !is_article && headline != false
+            h1.header-title=title
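+
+//- Rendered result sketch (illustrative – assumes image_sizes.medium = 1440
+    and image = { file: 'hero.jpg', alt: 'Hero image' } from _data.json):
+    <header class="header hero">
+        <img srcset="img/hero.jpg 1440w" sizes="100vw" alt="Hero image">
+    </header>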
diff --git a/website/_includes/_latest-posts.jade b/website/_includes/_latest-posts.jade
new file mode 100644
index 000000000..f4289866b
--- /dev/null
+++ b/website/_includes/_latest-posts.jade
@@ -0,0 +1,17 @@
+include _mixins
+
+- var post_counter = 0
+- var is_docs = (_section == 'docs')
+
+
+//- Latest Posts
+//- ============================================================================
+
++grid('padding')
+    each post, slug in ((is_docs) ? public.docs.tutorials._data : public.blog._data)
+        if slug != 'index' && slug != current.source && post_counter < (max || 3)
+
+            +grid-col('space-between', ((max > 2 && max % 3 == 0) ? 'third' : 'half'))
+                !=partial('_teaser', { teaser: post, slug: slug, _root: (is_docs) ? '/docs/tutorials/' : '/blog/' })
+
+            - post_counter++
diff --git a/website/_includes/_logo.jade b/website/_includes/_logo.jade
new file mode 100644
index 000000000..b5d2698af
--- /dev/null
+++ b/website/_includes/_logo.jade
@@ -0,0 +1,5 @@
+//- Logo
+//- ============================================================================
+
+svg.logo(class=(logo_size) ? 'logo--' + logo_size : '' viewBox='0 0 675 215' width='500')
+    path(d='M83.6 83.3C68.3 81.5 67.2 61 47.5 62.8c-9.5 0-18.4 4-18.4 12.7 0 13.2 20.3 14.4 32.5 17.7 20.9 6.3 41 10.7 41 33.3 0 28.8-22.6 38.8-52.4 38.8-24.9 0-50.2-8.9-50.2-31.8 0-6.4 6.1-11.3 12-11.3 7.5 0 10.1 3.2 12.7 8.4 5.8 10.2 12.3 15.6 28.3 15.6 10.2 0 20.6-3.9 20.6-12.7 0-12.6-12.8-15.3-26.1-18.4-23.5-6.6-43.6-10-46-36.1C-1 34.5 91.7 32.9 97 71.9c.1 7.1-6.5 11.4-13.4 11.4zm110.2-39c32.5 0 51 27.2 51 60.8 0 33.7-17.9 60.8-51 60.8-18.4 0-29.8-7.8-38.1-19.8v44.5c0 13.4-4.3 19.8-14.1 19.8-11.9 0-14.1-7.6-14.1-19.8V61.3c0-10.6 4.4-17 14.1-17 9.1 0 14.1 7.2 14.1 17v3.6c9.2-11.6 19.7-20.6 38.1-20.6zm-7.7 98.4c19.1 0 27.6-17.6 27.6-38.1 0-20.1-8.6-38.1-27.6-38.1-19.8 0-29 16.3-29 38.1 0 21.2 9.2 38.1 29 38.1zM266.9 76c0-23.4 26.9-31.7 52.9-31.7 36.6 0 51.7 10.7 51.7 46v34c0 8.1 5 24.1 5 29 0 7.4-6.8 12-14.1 12-8.1 0-14.1-9.5-18.4-16.3-11.9 9.5-24.5 16.3-43.8 16.3-21.3 0-38.1-12.6-38.1-33.3 0-18.4 13.2-28.9 29-32.5 0 .1 51-12 51-12.1 0-15.7-5.5-22.6-22-22.6-14.5 0-21.9 4-27.5 12.7-4.5 6.6-4 10.6-12.7 10.6-6.9-.1-13-4.9-13-12.1zm43.6 70.2c22.3 0 31.8-11.8 31.8-35.3v-5c-6 2-30.3 8-36.8 9.1-7 1.4-14.1 6.6-14.1 14.9.1 9.1 9.4 16.3 19.1 16.3zM474.5 0c31.5 0 65.7 18.8 65.7 48.8 0 7.7-5.8 14.1-13.4 14.1-10.3 0-11.8-5.5-16.3-13.4-7.6-13.9-16.5-23.3-36.1-23.3-30.2-.2-43.7 25.6-43.7 57.8 0 32.4 11.2 55.8 42.4 55.8 20.7 0 32.2-12 38.1-27.6 2.4-7.1 6.7-14.1 15.6-14.1 7 0 14.1 7.2 14.1 14.8 0 31.8-32.4 53.8-65.8 53.8-36.5 0-57.2-15.4-68.5-41-5.5-12.2-9.1-24.9-9.1-42.4-.1-49.2 28.6-83.3 77-83.3zm180.3 44.3c8 0 12.7 5.2 12.7 13.4 0 3.3-2.6 9.9-3.6 13.4L625.1 173c-8.6 22.1-15.1 37.4-44.5 37.4-14 0-26.1-1.2-26.1-13.4 0-7 5.3-10.6 12.7-10.6 1.4 0 3.6.7 5 .7 2.1 0 3.6.7 5 .7 14.7 0 16.8-15.1 22-25.5l-37.4-92.6c-2.1-5-3.6-8.4-3.6-11.3 0-8.2 6.4-14.1 14.8-14.1 9.5 0 13.3 7.5 15.6 15.6l24.7 73.5L638 65.5c3.9-10.5 4.2-21.2 16.8-21.2z')
diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade
new file mode 100644
index 000000000..cb1207673
--- /dev/null
+++ b/website/_includes/_mixins.jade
@@ -0,0 +1,381 @@
+include _functions
+
+//- Mixins
+//- ============================================================================
+
+//- Sections for content pages
+    id - [string] id, can be headline id as it's being prefixed (optional)
+    block - section content (block and inline elements)
+
+mixin section(id)
+    section.section(id=(id) ? 'section-' + id : '')&attributes(attributes)
+        block
+
+
+//- Flexbox grid to align children elements
+    ...style - [strings] flexbox CSS classes without prefix (optional)
+    block - container content (block and inline elements)
+
+mixin grid(...style)
+    .grid(class=prefixArgs(style, 'grid--'))&attributes(attributes)
+        block
+
+mixin grid-col(...style)
+    .grid-col(class=prefixArgs(style, 'grid-col--'))&attributes(attributes)
+        block
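+
+//- Usage sketch (modifier names illustrative): +grid('padding', 'wrap')
+    compiles to <div class="grid grid--padding grid--wrap">...</div>, since
+    prefixArgs prepends 'grid--' to every argument and joins them with spaces.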
+
+
+//- Aside
+    headline - [string] Headline of aside (optional)
+    block - aside content (inline elements)
+
+mixin aside(headline)
+    span.aside(data-label=headline)&attributes(attributes)
+        span.aside-body
+            block
+
+
+//- Paragraphs
+    block - paragraph content (inline elements)
+
+mixin lead
+    p.text-lead&attributes(attributes)
+        block
+
+
+//- Various text styles
+    block - text (inline elements)
+
+mixin example
+    p.text-example&attributes(attributes)
+        block
+
+mixin source
+    span.text-source&attributes(attributes)
+        block
+
+mixin label(...style)
+    span(class=(style.length) ? prefixArgs(style, 'label-') : 'label')&attributes(attributes)
+        block
+
+
+//- Headings with optional permalinks
+    level - [integer] headline level, i.e. 2 for h2
+    id - [string] unique id (optional, no permalink without id)
+    source - [string] link for source button (optional)
+    block - headline text (inline elements)
+
+mixin headline(level, id, source)
+    if level == 2
+        +h2(id, source)
+            block
+
+    else if level == 3
+        +h3(id, source)
+            block
+
+    else if level == 4
+        +h4(id, source)
+            block
+
+    else if level == 5
+        +h5(id, source)
+            block
+
+    else
+        +h6(id, source)
+            block
+
+mixin h1(id, source)
+    h1(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin h2(id, source)
+    h2(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin h3(id, source)
+    h3(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin h4(id, source)
+    h4(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin h5(id, source)
+    h5(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin h6(id, source)
+    h6(id=id)&attributes(attributes)
+        +permalink(id, source)
+            block
+
+mixin permalink(id, source)
+    if id
+        a.permalink(href='#' + id)
+            block
+
+    else
+        block
+
+    if source
+        +button('secondary', 'small', 'source')(href=source target='_blank') Source
+
+
+//- Button
+    type - [string] button type, refers to CSS class (i.e. 'primary' or 'secondary')
+    ...style - [strings] button CSS classes without prefix (optional)
+    block - button text (inline elements)
+
+mixin button(type, ...style)
+    - var classname = 'button-' + type + ' ' + ((style) ? prefixArgs(style, 'button--') : '')
+
+    a.button(class=classname)&attributes(attributes)
+        block
+
+mixin form-button(type, ...style)
+    - var classname = 'button-' + type + ' ' + ((style) ? prefixArgs(style, 'button--') : '')
+    button(class=classname)&attributes(attributes)
+        block
+
+
+//- Input
+    placeholder - [string] placeholder for input field (optional)
+    value - [string] value of input field (optional)
+
+mixin input(placeholder, value)
+    input.input(placeholder=placeholder value=value)&attributes(attributes)
+
+
+//- Icon
+    type - [string] icon name, refers to CSS classes
+    ...style - [strings] icon CSS classes without prefix (optional)
+    block - description, added as a text node to the icon element to prevent
+            line breaks between icon and text (inline elements)
+
+mixin icon(type, ...style)
+    span(class='icon-' + type + ' ' + prefixArgs(style, 'icon--') aria-hidden="true")&attributes(attributes)
+        block
+
+
+//- Image for illustration purposes
+    file - [string] file name (in /img)
+    alt - [string] descriptive alt text (optional)
+    caption - [string] image caption (optional)
+
+mixin image(file, alt, caption)
+    figure.image-container&attributes(attributes)
+        img(src='img/' + file alt=alt)
+
+        if caption
+            figcaption.text-caption=caption
+
+        block
+
+
+//- Illustrated code view
+    title - [string] title of window
+
+mixin code-demo(title)
+    .x-terminal&attributes(attributes)
+        .x-terminal-icons: span
+        .x-terminal-title=title
+        +code.x-terminal-code
+            block
+
+
+//- Data table
+    head - [array] column headings (optional, without headings no table
+           head is displayed)
+    ...style - [strings] table CSS classes without prefix (optional)
+    block - only +row (tr)
+
+mixin table(head, ...style)
+    table.table(class=prefixArgs(style, 'table--'))&attributes(attributes)
+
+        if head
+            tr.table-row
+                each column in head
+                    th.table-head-cell=column
+
+        block
+
+
+//- Data table row
+    block - only +cell (td)
+
+mixin row(...style)
+    tr.table-row(class=prefixArgs(style, 'table-cell--'))&attributes(attributes)
+        block
+
+
+//- Data table cell
+    block - table cell content (inline elements)
+
+mixin cell(...style)
+    td.table-cell(class=prefixArgs(style, 'table-cell--'))&attributes(attributes)
+        block
+
+
+//- General list (ordered and unordered)
+    type - [string] 'numbers', 'letters', 'roman' (optional)
+    start - [integer] starting point of list (1 = list starts at 1 or A)
+    block - only +item (li)
+
+mixin list(type, start)
+    if type
+        ol.list(class='list--' + type style=(start === 0 || start) ? 'counter-reset: li ' + (start - 1) : '')&attributes(attributes)
+            block
+
+    else
+        ul.list.list--bullets&attributes(attributes)
+            block
+
+
+//- List item
+    block - item text (inline elements)
+
+mixin item
+    li.list-item&attributes(attributes)
+        block
+
+
+//- Blockquote
+    source - [string] quote source / author (optional)
+    link - [string] link to quote source (only with source, optional)
+    block - quote text (inline elements)
+
+mixin quote(source, link)
+    blockquote.quote&attributes(attributes)
+        p.quote-text
+            block
+
+        if source && link
+            | #[a.quote-source(href=link target='_blank')=source]
+
+        else if source && !link
+            .quote-source !{source}
+
+
+//- Pullquotes with optional 'tweet this' function
+    tweet - [string] text to be tweeted (optional)
+    block - pullquote text (inline elements, only shown if no tweet text)
+
+mixin pullquote(tweet)
+    blockquote.quote&attributes(attributes)
+
+        p.quote-text-strong
+            if tweet
+                | !{tweet} #[a.quote-source(href=twitterShareUrl(current.path, tweet) target='_blank') Tweet this]
+
+            else
+                block
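+
+//- Usage sketch for the data table mixins above (content illustrative):
+    +table(['Name', 'Description'])
+        +row
+            +cell displaCy
+            +cell Dependency visualiser
+    renders a .table with one th header row and one td body row.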
+
+
+//- Code block
+    use as +code(args). to preserve whitespace and prevent code interpretation
+    language - [string] language for syntax highlighting (optional, default:
+               'python', see Prism for options: http://prismjs.com)
+    label - [string] code block headline (optional)
+    block - code text (inline elements)
+
+
+mixin code(language, label)
+    pre.code-block(class='lang-' + (language || default_syntax) data-label=label)&attributes(attributes)
+        code.code-inline
+            block
+
+
+//- Infobox for notes and alerts
+    label - [string] infobox headline (optional)
+    block - infobox text (inline and block elements)
+
+mixin infobox(label)
+    .box.box--info(data-label=label)&attributes(attributes)
+        p.box-body
+            block
+
+
+//- Alerts for notes and updates
+    button - [string] label for dismiss button (optional, defaults to a plain
+             close button)
+
+mixin alert(button)
+    .alert&attributes(attributes)
+        block
+
+        if button
+            +form-button('primary', 'small')(onclick='this.parentNode.parentNode.removeChild(this.parentNode);')=button
+
+        else
+            button.alert-close(onclick='this.parentNode.parentNode.removeChild(this.parentNode);')
+
+
+//- Embeds
+    border - [boolean] add border to embed container
+    caption - [string] embed caption
+    block - embed content (inline and block elements)
+
+mixin embed(border, caption)
+    figure.embed(class=(border) ? 'embed--border' : '')&attributes(attributes)
+        block
+
+        if caption
+            figcaption.embed-caption=caption
+
+
+//- displaCy
+    filename - [string] name of file in displacy folder (no .html)
+    caption - [string] caption (optional)
+    height - [integer] iframe height in px (optional)
+
+mixin displacy(filename, caption, height)
+    +embed(true, caption).embed--displacy
+        iframe(src='/blog/displacy/' + filename height=height)
+
+
+//- Logo, imports SVG
+    size - [string] 'tiny', 'small', 'regular' or 'large'
+
+mixin logo(size)
+    !=partial('/_includes/_logo', { logo_size: size })
+
+
+//-
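+
+//- Usage sketch for +code (content illustrative): the trailing dot marks the
+    block as plain text, so it keeps its whitespace and is highlighted by
+    Prism instead of being parsed as Jade:
+    +code('python', 'Example').
+        from spacy.en import English
+        nlp = English()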