mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Prepare English class for NER
This commit is contained in:
		
							parent
							
								
									f5830dc1c1
								
							
						
					
					
						commit
						220ce8bfed
					
				| 
						 | 
				
			
			@ -7,6 +7,7 @@ from ..vocab import Vocab
 | 
			
		|||
from ..tokenizer import Tokenizer
 | 
			
		||||
from ..syntax.parser import GreedyParser
 | 
			
		||||
from ..syntax.arc_eager import ArcEager
 | 
			
		||||
from ..syntax.ner import BiluoPushDown
 | 
			
		||||
from ..tokens import Tokens
 | 
			
		||||
from .pos import EnPosTagger
 | 
			
		||||
from .pos import POS_TAGS
 | 
			
		||||
| 
						 | 
				
			
			@ -58,6 +59,7 @@ class English(object):
 | 
			
		|||
            for later loading.
 | 
			
		||||
    """
 | 
			
		||||
    ParserTransitionSystem = ArcEager
 | 
			
		||||
    EntityTransitionSystem = BiluoPushDown
 | 
			
		||||
 | 
			
		||||
    def __init__(self, data_dir=''):
 | 
			
		||||
        if data_dir == '':
 | 
			
		||||
| 
						 | 
				
			
			@ -74,6 +76,7 @@ class English(object):
 | 
			
		|||
            infix_re = None
 | 
			
		||||
            self.has_parser_model = False
 | 
			
		||||
            self.has_tagger_model = False
 | 
			
		||||
            self.has_entity_model = False
 | 
			
		||||
        else:
 | 
			
		||||
            tok_data_dir = path.join(data_dir, 'tokenizer')
 | 
			
		||||
            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
 | 
			
		||||
| 
						 | 
				
			
			@ -82,6 +85,7 @@ class English(object):
 | 
			
		|||
            infix_re = re.compile(infix_re)
 | 
			
		||||
            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
 | 
			
		||||
            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
 | 
			
		||||
            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
 | 
			
		||||
 | 
			
		||||
        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
 | 
			
		||||
                                   suffix_re, infix_re,
 | 
			
		||||
| 
						 | 
				
			
			@ -89,6 +93,7 @@ class English(object):
 | 
			
		|||
        # These are lazy-loaded
 | 
			
		||||
        self._tagger = None
 | 
			
		||||
        self._parser = None
 | 
			
		||||
        self._entity = None
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def tagger(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -103,7 +108,15 @@ class English(object):
 | 
			
		|||
                                        self.ParserTransitionSystem)
 | 
			
		||||
        return self._parser
 | 
			
		||||
 | 
			
		||||
    def __call__(self, text, tag=True, parse=parse_if_model_present):
 | 
			
		||||
    @property
 | 
			
		||||
    def entity(self):
 | 
			
		||||
        if self._entity is None:
 | 
			
		||||
            self._entity = GreedyParser(path.join(self._data_dir, 'ner'),
 | 
			
		||||
                                        self.EntityTransitionSystem)
 | 
			
		||||
        return self._entity
 | 
			
		||||
 | 
			
		||||
    def __call__(self, text, tag=True, parse=parse_if_model_present,
 | 
			
		||||
                 entity=parse_if_model_present):
 | 
			
		||||
        """Apply the pipeline to some text.  The text can span multiple sentences,
 | 
			
		||||
        and can contain arbitrary whitespace.  Alignment into the original string
 | 
			
		||||
        
 | 
			
		||||
| 
						 | 
				
			
			@ -135,21 +148,39 @@ class English(object):
 | 
			
		|||
            msg = ("Incompatible arguments: tag=False, parse=True"
 | 
			
		||||
                   "Part-of-speech tags are required for parsing.")
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
        if entity == True and tag == False:
 | 
			
		||||
            msg = ("Incompatible arguments: tag=False, entity=True"
 | 
			
		||||
                   "Part-of-speech tags are required for entity recognition.")
 | 
			
		||||
            raise ValueError(msg)
 | 
			
		||||
 | 
			
		||||
        tokens = self.tokenizer(text)
 | 
			
		||||
        if parse == -1 and tag == False:
 | 
			
		||||
            parse = False
 | 
			
		||||
        elif parse == -1 and not self.has_parser_model:
 | 
			
		||||
            parse = False
 | 
			
		||||
        if entity == -1 and tag == False:
 | 
			
		||||
            entity = False
 | 
			
		||||
        elif entity == -1 and not self.has_entity_model:
 | 
			
		||||
            entity = False
 | 
			
		||||
        if tag and self.has_tagger_model:
 | 
			
		||||
            self.tagger(tokens)
 | 
			
		||||
        if parse == True and not self.has_parser_model:
 | 
			
		||||
            msg = ("Receive parse=True, but parser model not found.\n\n"
 | 
			
		||||
            msg = ("Received parse=True, but parser model not found.\n\n"
 | 
			
		||||
                  "Run:\n"
 | 
			
		||||
                  "$ python -m spacy.en.download\n"
 | 
			
		||||
                  "To install the model.")
 | 
			
		||||
            raise IOError(msg)
 | 
			
		||||
        if entity == True and not self.has_entity_model:
 | 
			
		||||
            msg = ("Received entity=True, but entity model not found.\n\n"
 | 
			
		||||
                  "Run:\n"
 | 
			
		||||
                  "$ python -m spacy.en.download\n"
 | 
			
		||||
                  "To install the model.")
 | 
			
		||||
            raise IOError(msg)
 | 
			
		||||
 | 
			
		||||
        if parse and self.has_parser_model:
 | 
			
		||||
            self.parser(tokens)
 | 
			
		||||
        if entity and self.has_entity_model:
 | 
			
		||||
            self.entity(tokens)
 | 
			
		||||
        return tokens
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user