#!/usr/bin/env python
#coding=utf8

'''
    Tag Query Language tools module.
'''

from __future__ import absolute_import

import re
from fnmatch import fnmatch
from ccserver.orderedset import OrderedSet

# Helper functions:

# Multipliers of the unit suffixes understood by prefix(). Keys are lowercase
# because prefix() lowercases its input before matching. One-letter symbols
# are powers of 1000, the two-letter "*i" symbols are powers of 1024; "b" and
# "o" leave the number unchanged.
PREFIXES = {'b': 1, 'o': 1,
            'k': 1000, 'ki': 1024,
            'm': 1000**2, 'mi': 1024**2,
            'g': 1000**3, 'gi': 1024**3,
            't': 1000**4, 'ti': 1024**4,
            'p': 1000**5, 'pi': 1024**5}

def prefix(number):
    '''
    Convert a possibly suffixed quantity to a float
    (eg: "42M" -> 42000000.0, "1Ki" -> 1024.0).

    :param number: the value to convert, it is passed through ``str()`` and
        lowercased before being analysed
    :return: the value as a float, or the lowercased string when it can't be
        interpreted as a number followed by at most one suffix of
        :data:`PREFIXES`
    '''
    text = str(number).lower()

    for sym, mul in PREFIXES.items():
        if text.endswith(sym):
            # Remove exactly one occurrence of the suffix. str.rstrip() can't
            # be used here: it strips a *set* of characters, which would
            # accept malformed values such as "5mmi" or "2kk".
            try:
                return float(text[:-len(sym)]) * mul
            except ValueError:
                return text

    # No known suffix, try to read it as a plain number:
    try:
        return float(text)
    except ValueError:
        return text

# End of helpers.

class TqlParsingError(Exception):
    '''
    Exception raised by this module when a TQL query is malformed (lexing,
    parsing or operator errors).
    '''


class TqlObject(dict):
    '''
    A special dict used to represent a TQL object. Each key/value of the
    :class:`TqlObject` instance is a tag associated to its value.

    It differs from a classical dict on these points:
     * An 'id' key *MUST* exist and be passed to the constructor. Assigning
       or deleting it afterwards through ``obj['id'] = ...`` or
       ``del obj['id']`` raises :exc:`KeyError` (other dict methods such as
       ``update`` or ``pop`` are not overridden by this class).
     * The :meth:`__hash__` method is defined (hash of the 'id' value) and
       allows to store :class:`TqlObject` instances in sets.
     * A :meth:`to_dict` method allows to export the object as a standard
       dict.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Accept the same arguments as :class:`dict`.

        :raise ValueError: if the resulting mapping has no 'id' key
        '''
        super(TqlObject, self).__init__(*args, **kwargs)
        if 'id' not in self:
            raise ValueError('"id" key must be defined in constructor')

    def __hash__(self):
        # Objects are hashed on their identifier only, not on their tags:
        return hash(self['id'])

    def __setitem__(self, key, value):
        if key == 'id':
            raise KeyError('Key %r is read-only.' % key)
        super(TqlObject, self).__setitem__(key, value)

    def __delitem__(self, key):
        if key == 'id':
            raise KeyError('Key %r is read-only.' % key)
        super(TqlObject, self).__delitem__(key)

    def to_dict(self, tags, deny=None):
        '''
        Export the :class:`TqlObject` as a standard Python dictionary. This
        method takes two arguments used to define the set of tags to include
        in the exported dictionary.

        :param tags: the list of tags to export or None if you want to export
            all tags ('id' is always added to this list)
        :param deny: useful in case where tags is None, to exclude some tags
            from the export ('id' can't be excluded)
        :return: a new dict mapping each exported tag to ``str(value)``

        .. note::
            This method doesn't export private tags (tags which start with
            '_') nor tags whose value is None.
        '''

        if tags is None:
            allowed = None
        else:
            allowed = set(tags)
            allowed.add('id')

        if deny is None:
            denied = set()
        else:
            # The identifier is always exported, even if explicitly denied:
            denied = set(deny)
            denied.discard('id')

        exported = {}

        # items() is used instead of iteritems() because it exists on both
        # Python 2 and Python 3 dictionaries:
        for key, value in self.items():
            if key.startswith('_') or value is None:
                continue
            if key in denied:
                continue
            if allowed is not None and key not in allowed:
                continue
            exported[key] = str(value)

        return exported


class TqlLexer(object):

    '''
    Simple tokenizer for the TQL language.

    Tokens are ``(type, value)`` tuples where type is one of the ``TOK_*``
    constants. Every character which is not a quote, a separator, an operator
    character or a parenthesis is part of a word (this includes whitespace).

    :param tql: the TQL string to tokenize.
    '''

    TOK_EOL = 1
    TOK_WORD = 2
    TOK_SEP = 3
    TOK_OP = 4
    TOK_PAR_OPEN = 5
    TOK_PAR_CLOSE = 6

    CHAR_QUOTES = '"\''
    CHAR_SEP = '&|^$%'
    CHAR_OP = '<>=:!~'
    CHAR_PAR_OPEN = '('
    CHAR_PAR_CLOSE = ')'

    def __init__(self, tql):
        # The string representing the query to tokenize:
        self._buf = tql

        # The current position of the cursor in the string:
        self._cursor = 0

        # Tokens already read by look_ahead() but not consumed yet:
        self._tokens_cache = []

    def get_token(self):
        '''
        Get the next token of the query and forward.

        :return: a ``(type, value)`` tuple, ``(TOK_EOL, 'eol')`` once the end
            of the query is reached
        '''

        if self._tokens_cache:
            # Tokens were already read by look_ahead(), serve the oldest one:
            return self._tokens_cache.pop(0)
        return self._get_next_token()

    def look_ahead(self, upto=1):
        '''
        Get the next token of the query but don't forward (multiple look_ahead
        calls will return the same token if get_token is not called).

        :param upto: how far to look, 1 is the next token, 2 the one after...
        :return: the requested ``(type, value)`` tuple
        '''

        # Read as many tokens as needed to populate the cache up to the
        # requested position:
        while len(self._tokens_cache) < upto:
            self._tokens_cache.append(self._get_next_token())

        return self._tokens_cache[upto - 1]

    def _get_next_token(self):
        '''
        Process the next token and return it.

        :raise TqlParsingError: if the end of the query is reached while a
            quote is still open
        '''

        # The buffer which store the value of current token:
        ctoken = ''

        # The type of the current processed token:
        ctype = None

        # The currently opened quote character and its escapement flag:
        cquote = None
        cescape = False

        while self._cursor < len(self._buf):
            cur = self._buf[self._cursor]

            if cquote:
                # We are in "quote mode", we eat each character until the
                # closing quote is found:
                self._cursor += 1
                if cur == cquote and not cescape:
                    # Closing quote is reached, we can return the word token:
                    return (self.TOK_WORD, ctoken)
                elif cur == '\\' and not cescape:
                    # Start of an escape sequence. The "not cescape" test
                    # lets an escaped backslash fall through to the next
                    # branch and be stored as a literal backslash:
                    cescape = True
                else:
                    ctoken += cur
                    cescape = False
            elif cur in self.CHAR_SEP:
                # The current character is a separator, two cases:
                if ctoken:
                    # A token is being built, the separator ends it: we
                    # return it without forwarding the cursor:
                    return (ctype, ctoken)
                # Separators are single characters, return it directly:
                self._cursor += 1
                return (self.TOK_SEP, cur)
            elif cur in self.CHAR_OP:
                # The current character is a part of an operator:
                if ctype is not None and ctype != self.TOK_OP:
                    # We just left a token of another type, return it without
                    # forwarding the cursor:
                    return (ctype, ctoken)
                ctype = self.TOK_OP
                ctoken += cur
                self._cursor += 1
            elif cur in self.CHAR_QUOTES:
                # The current character is an opening quote, it ends the
                # token being built (if any), else we enter in "quote mode":
                if ctype is not None:
                    return (ctype, ctoken)
                cquote = cur
                self._cursor += 1
            elif cur == self.CHAR_PAR_OPEN:
                # The current character is an opening parenthesis:
                if ctoken:
                    return (ctype, ctoken)
                self._cursor += 1
                return (self.TOK_PAR_OPEN, '(')
            elif cur == self.CHAR_PAR_CLOSE:
                # The current character is a closing parenthesis:
                if ctoken:
                    return (ctype, ctoken)
                self._cursor += 1
                return (self.TOK_PAR_CLOSE, ')')
            else:
                # The current character is a part of a word token:
                if ctype is not None and ctype != self.TOK_WORD:
                    # We just left a token of another type, return it without
                    # forwarding the cursor:
                    return (ctype, ctoken)
                ctype = self.TOK_WORD
                ctoken += cur
                self._cursor += 1

        # We have reached the end of the buffer:
        if cquote:
            # Still in "quote mode": the quote was never closed.
            raise TqlParsingError('Quote not closed')
        elif ctoken:
            # Return the token which was being built:
            return (ctype, ctoken)
        else:
            # Nothing left to read:
            return (self.TOK_EOL, 'eol')


class TqlAst(object):
    '''
    Marker base class for the AST nodes; it defines no attribute or method of
    its own.
    '''


class TqlAstAll(object):
    '''
    Represents the complete set of objects.
    '''

    def to_dot(self):
        '''
        Return the DOT format representation of the current AST node.

        Binary separator nodes call ``to_dot()`` on both of their children,
        so this node must provide it too.
        '''

        return ['%s [label="*"];' % id(self)]

    def eval(self, objects, all_objects):
        '''
        Select every object, whatever the current selection is.

        :param objects: the current selection (ignored)
        :param all_objects: the complete collection of objects
        :return: a ``(selection, all_objects)`` tuple where selection is an
            OrderedSet built from all_objects
        '''

        return OrderedSet(all_objects), all_objects


class TqlAstTag(TqlAst):
    '''
    AST leaf selecting objects on the presence of a tag. When the name given
    to the constructor starts with "-", the selection is inverted: objects
    which do *not* have the tag are selected.
    '''

    def __init__(self, name):
        # Split the optional "-" marker from the tag name itself:
        inverted = name.startswith('-')
        self.negate = inverted
        self.name = name[1:] if inverted else name

    def get_name(self):
        '''
        Return the tag name, prefixed by "-" again if the tag is negated.
        '''

        if self.negate:
            return '-' + self.name
        return self.name

    def to_dot(self):
        '''
        Return the DOT statements describing this node (a single statement,
        labelled with the tag name without its "-" marker).
        '''

        return ['%s [label="%s"];' % (id(self), self.name)]

    def eval(self, objects, all_objects):
        '''
        Filter objects on the presence (or absence if negated) of the tag.

        :return: a ``(selection, all_objects)`` tuple, selection being a new
            OrderedSet
        '''

        matched = OrderedSet()
        for candidate in objects:
            present = self.name in candidate
            # Keep the object when its tag presence differs from the negate
            # flag (present and not negated, or absent and negated):
            if present != bool(self.negate):
                matched.add(candidate)
        return matched, all_objects


class TqlAstTagCondition(TqlAst):
    '''
    A tag attached to a condition (operator + value) which select only
    the items which match the condition.

    Supported operators are the keys of :attr:`OPERATORS`; those listed in
    :attr:`NEGATABLE_OPERATORS` may be prefixed by "!" to invert their result.

    :param name: the name of the tag to test
    :param operator: the operator token (eg: "=", "!~", ">=")
    :param value: the right operand of the condition
    :raise TqlParsingError: if the operator is unknown or can't be negated
    '''

    # Operator token -> name of the method implementing it:
    OPERATORS = {':': 'op_glob',
                 '=': 'op_equal',
                 '>': 'op_gt',
                 '<': 'op_lt',
                 '>=': 'op_gte',
                 '<=': 'op_lte',
                 '~': 'op_regex'}

    NEGATABLE_OPERATORS = (':', '=', '~')

    def __init__(self, name, operator, value):
        self.name = name
        self.operator = operator
        self.value = value

        # A leading "!" inverts the operator; self.operator keeps the
        # operator without this marker:
        negate = self.operator.startswith('!')
        if negate:
            self.operator = self.operator[1:]
            if self.operator not in self.NEGATABLE_OPERATORS:
                raise TqlParsingError('Bad operator %r' % operator)

        op_funcname = self.OPERATORS.get(self.operator)
        if op_funcname is None:
            raise TqlParsingError('Bad operator %r' % self.operator)

        func = getattr(self, op_funcname, None)
        if func is None:
            raise TqlParsingError('Operator function not found '
                                  '%r' % op_funcname)

        if negate:
            self.operator_func = lambda l, r: not func(l, r)
        else:
            self.operator_func = func

    def to_dot(self):
        '''
        Return the DOT format representation of the current AST node and
        sub-nodes.
        '''

        stmts = []
        name = id(self)
        stmts.append('%s [label="%s %s %s"];' % (name, self.name, self.operator,
                                                 self.value))
        return stmts

    def eval(self, objects, all_objects):
        '''
        Select the objects which have the tag and whose value satisfies the
        condition.

        :return: a ``(selection, all_objects)`` tuple, selection being a new
            OrderedSet
        '''

        objs = OrderedSet()
        for obj in objects:
            if self.name in obj:
                if self.operator_func(obj[self.name], self.value):
                    objs.add(obj)
        return objs, all_objects

    @staticmethod
    def _ordered_operands(lvalue, rvalue):
        '''
        Normalize both operands of an ordering operator with prefix().

        :return: the ``(left, right)`` tuple to compare, or None when the
            operands can't be ordered: the tag value is None, or only one of
            the two operands is a number
        '''

        if lvalue is None:
            return None
        lvalue, rvalue = prefix(lvalue), prefix(rvalue)
        # prefix() returns either a float or a string; a number can't be
        # meaningfully ordered against a string:
        if isinstance(lvalue, float) != isinstance(rvalue, float):
            return None
        return lvalue, rvalue

    def op_glob(self, value, pattern):
        '''
        The globbing operator, handle * and ? characters.
        '''

        if value is None:
            return False
        return fnmatch(str(value), str(pattern))

    def op_lt(self, lvalue, rvalue):
        '''
        The lesser than operator (False if operands can't be ordered).
        '''

        operands = self._ordered_operands(lvalue, rvalue)
        return operands is not None and operands[0] < operands[1]

    def op_lte(self, lvalue, rvalue):
        '''
        The lesser or equal than operator (False if operands can't be
        ordered).
        '''

        operands = self._ordered_operands(lvalue, rvalue)
        return operands is not None and operands[0] <= operands[1]

    def op_gt(self, lvalue, rvalue):
        '''
        The greater than operator (False if operands can't be ordered).
        '''

        operands = self._ordered_operands(lvalue, rvalue)
        return operands is not None and operands[0] > operands[1]

    def op_gte(self, lvalue, rvalue):
        '''
        The greater or equal than operator (False if operands can't be
        ordered).
        '''

        operands = self._ordered_operands(lvalue, rvalue)
        return operands is not None and operands[0] >= operands[1]

    def op_regex(self, value, pattern):
        '''
        The regular expression operator (the pattern is matched from the
        beginning of the value).

        :raise TqlParsingError: if the pattern is not a valid regex
        '''

        if value is None:
            return False
        try:
            try:
                matched = re.match(pattern, value)
            except TypeError:
                # Non-string tag values (eg: numbers) are matched on their
                # string representation, like op_glob does:
                matched = re.match(pattern, str(value))
        except re.error:
            raise TqlParsingError('Error in your regex pattern: %s' % pattern)
        return matched is not None

    def op_equal(self, lvalue, rvalue):
        '''
        The strict equal operator. Operands made only of digits are compared
        as integers (so "007" equals "7"), others as strings.
        '''

        # This test must be done before the str() conversion, else None
        # would become the string "None":
        if lvalue is None:
            return False

        lvalue, rvalue = str(lvalue), str(rvalue)
        if lvalue.isdigit() and rvalue.isdigit(): # Integer comparison:
            try:
                return int(lvalue) == int(rvalue)
            except ValueError:
                # Digits that int() can't read, compare them as strings:
                pass

        return lvalue == rvalue


class TqlAstBinarySeparators(TqlAst):
    '''
    Common ancestor of the separators which combine two sub-trees. Subclasses
    set :attr:`node_token` and implement :meth:`eval`.
    '''

    node_token = None

    def __init__(self, left, right):
        self.right = right
        self.left = left

    def to_dot(self):
        '''
        Return the DOT statements for this node: the node itself, one edge to
        each child (labelled "l" and "r"), then the statements of the left
        and right sub-trees.
        '''

        me = id(self)
        stmts = [
            '%s [shape=circle, label="%s"];' % (me, self.node_token),
            '%s -> %s [label="l"];' % (me, id(self.left)),
            '%s -> %s [label="r"];' % (me, id(self.right)),
        ]
        stmts.extend(self.left.to_dot())
        stmts.extend(self.right.to_dot())
        return stmts

    def eval(self, objects, objects_all):
        '''
        Must be overridden, this base implementation always raises
        :exc:`NotImplementedError`.
        '''

        message = ('This function must be implemented '
                   'in derivated classes')
        raise NotImplementedError(message)


class TqlAstSorter(TqlAst):
    '''
    A sorting separator.

    :param child: the AST node whose selection is sorted
    :param name: the node giving the sort key; its ``name`` attribute is the
        tag to sort on and its ``negate`` attribute reverses the order
    '''

    node_token = '%'

    def __init__(self, child, name):
        self.name = name
        self.child = child

    def to_dot(self):
        '''
        Return the DOT format representation of the current AST node and
        sub-nodes.
        '''

        stmts = []
        name = id(self)
        stmts.append('%s [label="%s (%s)"];' % (name, self.node_token,
                                                self.name.name))
        stmts.append('%s -> %s;' % (name, id(self.child)))
        stmts += self.child.to_dot()
        return stmts

    def eval(self, objects, all_objects):
        '''
        Evaluate the child node and sort its selection on the tag.

        :return: a ``(sorted_list, all_objects)`` tuple; the first item is
            always a list, reversed when the sort tag is negated
        '''

        objects, _ = self.child.eval(objects, all_objects)
        sorted_objects = sorted(objects, key=self._sort_getter)
        if self.name.negate:
            # Reverse in place to keep returning a list instead of a
            # one-shot iterator:
            sorted_objects.reverse()
        return sorted_objects, all_objects

    def _sort_getter(self, obj):
        '''
        Return the sort key of an object as a ``(rank, value)`` tuple.

        The rank groups values of the same kind so that values of different
        kinds are never compared to each other: missing/None values first,
        then values readable as a number (compared as floats), then all
        other values (compared as is).
        '''

        value = obj.get(self.name.name)

        if value is None:
            return (0, 0.0)

        try:
            return (1, float(str(value)))
        except ValueError:
            return (2, value)

class TqlAstLimit(TqlAst):
    '''
    A limitation separator.

    :param child: the AST node whose selection is limited
    :param obj_slice: the slice as a "length", "length,offset" or ",offset"
        string; a missing offset means 0, a missing length means no limit
    :raise TqlParsingError: if length or offset is not an integer
    '''

    node_token = '^'

    def __init__(self, child, obj_slice):
        length, _, offset = obj_slice.partition(',')
        try:
            self.start = int(offset) if offset else 0
            self.stop = (self.start + int(length)) if length else None
        except ValueError:
            # Report a malformed slice like any other error of the query:
            raise TqlParsingError('Bad limit %r' % obj_slice)
        self.child = child

    def to_dot(self):
        '''
        Return the DOT format representation of the current AST node and
        sub-nodes.
        '''

        stmts = []
        name = id(self)
        stmts.append('%s [label="%s (%s,%s)"];' % (name, self.node_token,
                                                   self.start, self.stop))
        stmts.append('%s -> %s;' % (name, id(self.child)))
        stmts += self.child.to_dot()
        return stmts

    def eval(self, objects, all_objects):
        '''
        Evaluate the child node and keep only the ``[start:stop]`` slice of
        its selection.

        :return: a ``(selection, all_objects)`` tuple, selection being a
            tuple
        '''

        objects, _ = self.child.eval(objects, all_objects)
        return tuple(objects)[self.start:self.stop], all_objects


class TqlAstIntersect(TqlAstBinarySeparators):
    '''
    The "&" separator: keeps the objects selected by both of its sub-trees.
    '''

    node_token = '&'

    def eval(self, objects, all_objects):
        '''
        Evaluate both sub-trees against the same selection and return the
        ``&`` of their results along with all_objects.
        '''

        candidates = OrderedSet(objects)
        left_eval = self.left.eval(candidates, all_objects)
        left_objs = OrderedSet(left_eval[0])
        right_eval = self.right.eval(candidates, all_objects)
        right_objs = OrderedSet(right_eval[0])
        return left_objs & right_objs, all_objects


class TqlAstUnion(TqlAstBinarySeparators):
    '''
    The "|" separator: keeps the objects selected by any of its sub-trees.
    '''

    node_token = '|'

    def eval(self, objects, all_objects):
        '''
        Evaluate both sub-trees against the same selection and return the
        ``|`` of their results along with all_objects.
        '''

        candidates = OrderedSet(objects)
        left_eval = self.left.eval(candidates, all_objects)
        left_objs = OrderedSet(left_eval[0])
        right_eval = self.right.eval(candidates, all_objects)
        right_objs = OrderedSet(right_eval[0])
        return left_objs | right_objs, all_objects


class TqlParser(object):
    '''
    Parse a TQL query and return the AST and the list of tags to show.

    :param query: the TQL query string
    '''

    def __init__(self, query):
        self._query = query
        self._lexer = TqlLexer(query)
        self._to_show = []
        self._to_get = set()
        self._to_check = set()

    def parse(self):
        '''
        Parse and return the AST of the TQL query. This method also returns
        the list of tags to show, to get and to check.

        :return: a ``(ast, to_show, to_get, to_check)`` tuple. ast is None
            for an empty query; to_show is the list of tag names in the order
            they appear in the query (including the ones introduced by "$");
            to_get is the set of tag names used as selection expressions;
            to_check is the set of tag names used as sort keys.
        :raise TqlParsingError: if the query is malformed
        '''

        # Restart from a fresh lexer and fresh collections so this method
        # can be called several times on the same parser:
        self._lexer = TqlLexer(self._query)
        self._to_show = []
        self._to_get = set()
        self._to_check = set()

        tree = self._parse()

        # _parse() stops on a closing parenthesis to give the hand back to
        # the enclosing sub-query; at the top level there is none, so
        # anything left here is an unbalanced ")":
        token = self._lexer.look_ahead()
        if token[0] != self._lexer.TOK_EOL:
            raise TqlParsingError('Unexpected token %r' % token[1])

        return (tree, self._to_show, self._to_get, self._to_check)

    def _parse(self):
        '''
        Parse a (sub-)query up to the end of line or to the closing
        parenthesis ending it (which is not consumed) and return its AST.
        '''

        # Watch the next token to process:
        token = self._lexer.look_ahead()

        # Token is a word, we will parse it:
        if token[0] == self._lexer.TOK_WORD:
            word = self._parse_expression()
            token = self._lexer.look_ahead()

            # The common token after a name is a separator:
            if token[0] == self._lexer.TOK_SEP:
                return self._parse_separator(word)
            elif token[0] in (self._lexer.TOK_EOL, self._lexer.TOK_PAR_CLOSE):
                return word
            else:
                raise TqlParsingError('Unexpected token %s' % token[1])

        # Token is an opening parenthesis, we recursively call the _parse
        # method to process the sub-query:
        elif token[0] == self._lexer.TOK_PAR_OPEN:
            self._lexer.get_token()
            tree = self._parse()
            token = self._lexer.look_ahead()

            # A sub-query must be closed by a closing parenthesis:
            if token[0] != self._lexer.TOK_PAR_CLOSE:
                raise TqlParsingError('Unexpected token %r' % token[1])
            self._lexer.get_token()

            # Check what follows the sub-query:
            token = self._lexer.look_ahead()
            if token[0] == self._lexer.TOK_SEP:
                return self._parse_separator(tree)
            elif token[0] in (self._lexer.TOK_EOL,
                              self._lexer.TOK_PAR_CLOSE):
                return tree
            else:
                raise TqlParsingError('Unexpected token %r' % token[1])

        # The query starts with a separator, its left operand is the
        # complete set of objects:
        elif token[0] == self._lexer.TOK_SEP:
            return self._parse_separator(TqlAstAll())

        # Token is the EOL:
        elif token[0] == self._lexer.TOK_EOL:
            return None

        else:
            raise TqlParsingError('Parsing error, WORD expected.')

    def _parse_separator(self, left):
        '''
        Parse a separator and its right operand, then the following
        separators if any.

        :param left: the AST node at the left of the separator
        :return: the AST node combining left with what follows
        '''

        # Getting the separator token and the token to its right:
        separator = self._lexer.get_token()
        right = self._lexer.look_ahead()

        # Checking if separator token is really a separator:
        if separator[0] != self._lexer.TOK_SEP:
            raise TqlParsingError('Unexpected token %r' % separator[1])

        sep = separator[1]
        is_set_operator = sep in ('&', '|')

        if right[0] == self._lexer.TOK_PAR_OPEN:
            # Only & and | can combine the left operand with a sub-query;
            # the other separators expect a single word (slice, tag name):
            if not is_set_operator:
                raise TqlParsingError('Unexpected token %r' % right[1])
            right = self._parse()
        elif right[0] == self._lexer.TOK_WORD:
            if is_set_operator:
                # The right token of a & or | separator can be a single
                # token 'word', or an expression:
                right = self._parse_expression()
            elif sep in ('^', '$'):
                # The right token of a ^ separator is not a tag but a slice
                # (x,y), and the one of a $ separator is only a tag to show
                # which is not handled in the AST: keep the raw word.
                right = self._lexer.get_token()[1]
            else:
                right = self._parse_word()
        else:
            raise TqlParsingError('Unexpected token %r' % right[1])

        # Create the AST node for each separator:
        if sep == '&':
            tree = TqlAstIntersect(left, right)
        elif sep == '|':
            tree = TqlAstUnion(left, right)
        elif sep == '^':
            tree = TqlAstLimit(left, right)
        elif sep == '%':
            tree = TqlAstSorter(left, right)
        elif sep == '$':
            self._to_show.append(right)
            tree = left
        else:
            raise TqlParsingError('Bad separator %r' % sep)

        # Process the next token, maybe another separator, or the
        # end of line / sub-query:
        token = self._lexer.look_ahead()
        if token[0] == self._lexer.TOK_SEP:
            return self._parse_separator(tree)
        elif token[0] in (self._lexer.TOK_EOL, self._lexer.TOK_PAR_CLOSE):
            return tree
        else:
            raise TqlParsingError('Unexpected token %r' % token[1])

    def _parse_word(self):
        '''
        Consume a word used as a bare tag (the sort key of a % separator)
        and return its :class:`TqlAstTag` node.
        '''

        word = self._lexer.get_token()

        # Record the tag name as a tag to check and to show:
        self._to_check.add(word[1])
        self._to_show.append(word[1])

        return TqlAstTag(word[1])

    def _parse_expression(self):
        '''
        Consume a tag optionally followed by an operator and a value, and
        return the matching :class:`TqlAstTagCondition` or
        :class:`TqlAstTag` node.
        '''

        left = self._lexer.get_token()
        following = self._lexer.look_ahead()

        # Record the tag name as a tag to get and to show:
        self._to_get.add(left[1])
        self._to_show.append(left[1])

        if following[0] != self._lexer.TOK_OP:
            return TqlAstTag(left[1])

        operator = self._lexer.get_token()
        right = self._lexer.get_token()

        if right[0] != self._lexer.TOK_WORD:
            raise TqlParsingError('Unexpected token %r' % right[1])

        return TqlAstTagCondition(left[1], operator[1], right[1])
