######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

import logging
import re

from .enums import ProbingState


class CharSetProber(object):

    SHORTCUT_THRESHOLD = 0.95
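    # Concrete probers that track a running confidence typically compare it
    # against SHORTCUT_THRESHOLD and switch themselves to
    # ProbingState.FOUND_IT once it is exceeded, so the caller can stop
    # feeding data early (see the concrete probers in this package for the
    # exact checks).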

    def __init__(self, lang_filter=None):
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        return None

    def feed(self, buf):
        pass

    @property
    def state(self):
        return self._state

    def get_confidence(self):
        return 0.0
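
    # A minimal sketch of how a caller is expected to drive a concrete prober.
    # ``SomeConcreteProber`` is a hypothetical subclass named only for
    # illustration; the real subclasses live elsewhere in this package.
    #
    #     prober = SomeConcreteProber()
    #     prober.reset()
    #     prober.feed(chunk)                      # may be called repeatedly
    #     if prober.state == ProbingState.FOUND_IT:
    #         encoding = prober.charset_name
    #         confidence = prober.get_confidence()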

    @staticmethod
    def filter_high_byte_only(buf):
        buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
        return buf
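
    # filter_high_byte_only() collapses every run of ASCII bytes into a single
    # space, so only the high-byte (non-ASCII) sequences survive. An
    # illustrative trace of the substitution above:
    #
    #     CharSetProber.filter_high_byte_only(b'abc\xe4\xb8\xadxyz')
    #     # -> b' \xe4\xb8\xad '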

    @staticmethod
    def filter_international_words(buf):
        """
        We define three types of bytes:
        alphabet: English letters [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought of as a series of words delimited by
        markers. This function retains only the words that contain at least
        one international character. All contiguous sequences of markers are
        replaced by a single space ASCII character.

        This filter applies to all scripts which do not use English characters.
        """
        filtered = bytearray()

        # This regex matches only words that have at least one international
        # character. The word may include one marker character at the end.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered
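
    # An illustrative trace of filter_international_words(): the pure-ASCII
    # words are dropped, the word containing \xf6 is kept, and its trailing
    # '!' marker is replaced with a space.
    #
    #     CharSetProber.filter_international_words(b'hello, w\xf6rld! 123')
    #     # -> bytearray(b'w\xf6rld ')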

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered
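
# An illustrative trace of filter_with_english_letters(): markup inside the
# tags is mostly dropped, the text between tags is kept, and the letters
# immediately before the closing '>' ("span") survive, as the docstring notes.
#
#     CharSetProber.filter_with_english_letters(
#         b'<span lang="de">sch\xf6n!</span>')
#     # -> bytearray(b'sch\xf6n span ')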