Code refactoring to move the new parser out of optimizer.py and into parser.py; optimizer.py still re-exports the old names, so it remains compatible

git-svn-id: http://galileo.dmi.unict.it/svn/relational/trunk@190 014f5005-505e-4b48-8d0a-63407b615a7c
2009-06-09 10:36:33 +07:00 · 2009-06-09 10:36:33 +07:00 · 3c4b91272b
parent 6524ea2d9f
commit 3c4b91272b
3 changed files with 252 additions and 239 deletions
--- a/3
+++ b/3
@ -85,4 +85,5 @@
 - Added stub for converting SQL to relational algebra
 - Implemented futile_union_intersection_subtraction general optimization
 - Implemented swap_rename_projection general optimization
- Replaced old relational algebra to python compiler with new one based on the new tokenizer/parser (Rev 188)
+- Replaced old relational algebra to python compiler with new one based on the new tokenizer/parser (Rev 188)
+- Code refactoring to move the new parser out of optimizer.py and into parser.py, keeping optimizer.py compatible (Rev 190)
--- a/relational/optimizer.py
+++ b/relational/optimizer.py
@ -22,240 +22,20 @@
 For now it is highly experimental, and it shouldn't be used in 3rd party applications.'''

 import optimizations
+import parser

-RELATION=0
-UNARY=1
-BINARY=2
-b_operators=('*','-','ᑌ','ᑎ','ᐅᐊ','ᐅLEFTᐊ','ᐅRIGHTᐊ','ᐅFULLᐊ')
-u_operators=('π','σ','ρ')

-op_functions={'*':'product','-':'difference','ᑌ':'union','ᑎ':'intersection','ᐅᐊ':'join','ᐅLEFTᐊ':'outer_left','ᐅRIGHTᐊ':'outer_right','ᐅFULLᐊ':'outer','π':'projection','σ':'selection','ρ':'rename'}
-
-class node (object):
-    '''This class is a node of a relational expression. Leaves are relations and internal nodes are operations.
-    
-    The kind property says if the node is a binary operator, unary operator or relation.
-    Since relations are leaves, a relation node will have no attribute for children.
-    
-    If the node is a binary operator, it will have left and right properties.
-    
-    If the node is a unary operator, it will have a child, pointing to the child node and a prop containing
-    the string with the props of the operation.'''
-    kind=None
-    
-    def __init__(self,expression=None):
-        
-        if expression==None or len(expression)==0:
-            return
-        
-        '''Generates the tree from the tokenized expression'''
-        
-        #If the list contains only a list, it will consider the lower level list.
-        #This will allow things like ((((((a))))) to work
-        while len(expression)==1 and isinstance(expression[0],list): 
-                expression=expression[0]
-        
-        #The list contains only 1 string. Means it is the name of a relation
-        if len(expression)==1 and isinstance(expression[0],str): 
-            self.kind=RELATION
-            self.name=expression[0]
-            return
-            
-        '''Expression from right to left, searching for binary operators
-        this means that binary operators have lesser priority than
-        unary operators.
-        It find the operator with lesser priority, uses it as root of this
-        (sub)tree using everything on its left as left parameter (so building
-        a left subtree with the part of the list located on left) and doing 
-        the same on right.
-        Since it searches for strings, and expressions into parenthesis are
-        within sub-lists, they won't be found here, ensuring that they will
-        have highest priority.'''
-        for i in range(len(expression)-1,-1,-1): 
-            if expression[i] in b_operators: #Binary operator              
-                self.kind=BINARY
-                self.name=expression[i]
-                self.left=node(expression[:i]) 
-                self.right=node(expression[i+1:])
-                return
-        '''Searches for unary operators, parsing from right to left'''
-        for i in range(len(expression)-1,-1,-1):
-            if expression[i] in u_operators: #Unary operator
-                self.kind=UNARY
-                self.name=expression[i]
-                self.prop=expression[1+i].strip()
-                self.child=node(expression[2+i])
-                
-                return       
-        pass
-    def toPython(self):
-        '''This method converts the expression into python code'''
-        if self.name in b_operators:
-            return '%s.%s(%s)' % (self.left.toPython(),op_functions[self.name],self.right.toPython())
-        elif self.name in u_operators:
-            prop =self.prop
-            
-            #Converting parameters
-            if self.name=='π':#Projection
-                prop='\"%s\"' %  prop.replace(' ','').replace(',','\",\"')
-            elif self.name=="ρ": #Rename
-                prop='{\"%s\"}' % prop.replace(',','\",\"').replace('➡','\":\"').replace(' ','')
-            else: #Selection
-                prop='\"%s\"' %  prop
-                        
-            return '%s.%s(%s)' % (self.child.toPython(),op_functions[self.name],prop)
-        else:
-            return self.name
-        pass
-    def result_format(self,rels):
-        '''This function returns a list containing the fields that the resulting relation will have.
-        Since it needs to know real instances of relations, it requires a dictionary where keys are
-        the names of the relations and the values are the relation objects.'''
-        if rels==None:            
-            return
-        
-        if self.kind==RELATION:
-            return list(rels[self.name].header.attributes)
-        elif self.kind==BINARY and self.name in ('-','ᑌ','ᑎ'):
-            return self.left.result_format(rels)
-        elif self.name=='π':
-            l=[]
-            for i in self.prop.split(','):
-                l.append(i.strip())
-            return l
-        elif self.name=='*':
-            return self.left.result_format(rels)+self.right.result_format(rels)
-        elif self.name=='σ' :
-            return self.child.result_format(rels)
-        elif self.name=='ρ':
-            _vars={}
-            for i in self.prop.split(','):
-                q=i.split('➡')
-                _vars[q[0].strip()]=q[1].strip()
-            
-            _fields=self.child.result_format(rels)
-            for i in range(len(_fields)):
-                if _fields[i] in _vars:
-                    _fields[i]=_vars[_fields[i]]
-            return _fields
-        elif self.name in ('ᐅᐊ','ᐅLEFTᐊ','ᐅRIGHTᐊ','ᐅFULLᐊ'):
-            return list(set(self.left.result_format(rels)).union(set(self.right.result_format(rels))))
-            
-            
-        pass
-    
-    def __eq__(self,other):
-        if not (isinstance(other,node) and self.name==other.name and self.kind==other.kind):
-            return False
-        
-        if self.kind==UNARY:
-            if other.prop!=self.prop:
-                return False
-            return self.child==other.child
-        if self.kind==BINARY:
-            return self.left==other.left and self.right==other.right
-        return True
-    def __str__(self):
-        if (self.kind==RELATION):
-            return self.name
-        elif (self.kind==UNARY):
-            return self.name + " "+ self.prop+ " (" + self.child.__str__() +")"
-        elif (self.kind==BINARY):
-            if self.left.kind==RELATION:
-                le=self.left.__str__()
-            else:
-                le="("+self.left.__str__()+")"
-            if self.right.kind==RELATION:
-                re=self.right.__str__()
-            else:
-                re="("+self.right.__str__()+")"
-                
-            return (le+ self.name +re)
-
-def tokenize(expression):
-    '''This function converts an expression into a list where
-    every token of the expression is an item of a list. Expressions into
-    parenthesis will be converted into sublists.'''
-    items=[] #List for the tokens
-    
-    '''This is a state machine. Initial status is determined by the starting of the
-    expression. There are the following statuses:
-    
-    relation: this is the status if the expressions begins with something else than an
-        operator or a parenthesis.
-    binary operator: this is the status when parsing a binary operator, nothing much to say
-    unary operator: this status is more complex, since it will be followed by a parameter AND a
-        sub-expression.
-    sub-expression: this status is entered when finding a '(' and will be exited when finding a ')'.
-        means that the others open must be counted to determine which close is the right one.'''
-    
-    expression=expression.strip() #Removes initial and endind spaces
-    state=0
-    '''
-    0 initial and useless
-    1 previous stuff was a relation
-    2 previous stuff was a sub-expression
-    3 previous stuff was a unary operator
-    4 previous stuff was a binary operator
-    '''
-
-    while len(expression)>0:
-        if expression.startswith('('): #Parenthesis state
-            state=2
-            par_count=0 #Count of parenthesis
-            end=0
-            
-            for i in range(len(expression)):
-                if expression[i]=='(':
-                    par_count+=1
-                elif expression[i]==')':
-                    par_count-=1
-                    if par_count==0:
-                        end=i
-                        break
-            #Appends the tokenization of the content of the parenthesis
-            items.append(tokenize(expression[1:end]))
-            #Removes the entire parentesis and content from the expression
-            expression=expression[end+1:].strip()
-        
-        elif expression.startswith("σ") or expression.startswith("π") or expression.startswith("ρ"): #Unary 2 bytes
-            items.append(expression[0:2]) #Adding operator in the top of the list
-            expression=expression[2:].strip() #Removing operator from the expression
-            par=expression.find('(')
-        
-            items.append(expression[:par]) #Inserting parameter of the operator
-            expression=expression[par:].strip() #Removing parameter from the expression
-        elif expression.startswith("*") or expression.startswith("-"): # Binary 1 byte
-            items.append(expression[0])
-            expression=expression[1:].strip() #1 char from the expression
-            state=4
-        elif expression.startswith("ᑎ") or expression.startswith("ᑌ"): #Binary short 3 bytes
-            items.append(expression[0:3]) #Adding operator in the top of the list
-            expression=expression[3:].strip() #Removing operator from the expression
-
-            state=4
-        elif expression.startswith("ᐅ"): #Binary long
-            i=expression.find("ᐊ")
-            items.append(expression[:i+3])
-            expression=expression[i+3:].strip()
-            
-            state=4
-        else: #Relation (hopefully)
-            if state==1: #Previous was a relation, appending to the last token
-                i=items.pop()
-                items.append(i+expression[0])
-                expression=expression[1:].strip() #1 char from the expression
-            else:
-                state=1
-                items.append(expression[0])
-                expression=expression[1:].strip() #1 char from the expression
-    
-    return items
-
-def tree(expression):
-    '''This function parses a relational algebra expression into a tree and returns
-    the root node using the Node class defined in this module.'''
-    return node(tokenize(expression))
+#Stuff that was here before, keeping it for compatibility
+RELATION=parser.RELATION
+UNARY=parser.UNARY
+BINARY=parser.BINARY
+b_operators=parser.b_operators
+u_operators=parser.u_operators
+op_functions=parser.op_functions
+node=parser.node
+tokenize=parser.tokenize
+tree=parser.tree
+#End of the stuff

 def optimize_all(expression,rels):
    '''This function performs all the available optimizations'''
@ -281,7 +61,6 @@ def specific_optimize(expression,rels):
            total+=i(n,rels) #Performs the optimization
    return n.__str__()
            
-
 def general_optimize(expression):
    '''This function performs general optimizations. Means that it will not need to
    know the fields used by the relations'''
@ -321,9 +100,7 @@ if __name__=="__main__":
    
    '''
    σ skill=='C' (π id,name,chief,age (σ chief==i and age>a (ρ id➡i,age➡a(π id,age(people))*people)) ᐅᐊ skills)
-    (π id,name,chief,age (σ chief == i  and age > a  ((ρ age➡a,id➡i (π id,age (people)))*people)))ᐅᐊ(σ skill == 'C'  (skills))
-    
-    
+    (π id,name,chief,age (σ chief == i  and age > a  ((ρ age➡a,id➡i (π id,age (people)))*people)))ᐅᐊ(σ skill == 'C'  (skills))    
    '''
    
    #print specific_optimize("σ name==skill and age>21 and id==indice and skill=='C'(P1ᐅᐊS1)",rels)
--- a/relational/parser.py
+++ b/relational/parser.py
@ -18,7 +18,242 @@
 # 
 # author Salvo "LtWorf" Tomaselli <tiposchi@tiscali.it>

-import optimizer
+
+RELATION=0
+UNARY=1
+BINARY=2
+b_operators=('*','-','ᑌ','ᑎ','ᐅᐊ','ᐅLEFTᐊ','ᐅRIGHTᐊ','ᐅFULLᐊ')
+u_operators=('π','σ','ρ')
+
+op_functions={'*':'product','-':'difference','ᑌ':'union','ᑎ':'intersection','ᐅᐊ':'join','ᐅLEFTᐊ':'outer_left','ᐅRIGHTᐊ':'outer_right','ᐅFULLᐊ':'outer','π':'projection','σ':'selection','ρ':'rename'}
+
+class node (object):
+    '''This class is a node of a relational expression. Leaves are relations and internal nodes are operations.
+    
+    The kind property says if the node is a binary operator, unary operator or relation.
+    Since relations are leaves, a relation node will have no attribute for children.
+    
+    If the node is a binary operator, it will have left and right properties.
+    
+    If the node is a unary operator, it will have a child, pointing to the child node and a prop containing
+    the string with the props of the operation.'''
+    kind=None
+    
+    def __init__(self,expression=None):
+        
+        if expression==None or len(expression)==0:
+            return
+        
+        '''Generates the tree from the tokenized expression'''
+        
+        #If the list contains only a list, it will consider the lower level list.
+        #This will allow things like ((((((a))))) to work
+        while len(expression)==1 and isinstance(expression[0],list): 
+                expression=expression[0]
+        
+        #The list contains only 1 string. Means it is the name of a relation
+        if len(expression)==1 and isinstance(expression[0],str): 
+            self.kind=RELATION
+            self.name=expression[0]
+            return
+            
+        '''Expression from right to left, searching for binary operators
+        this means that binary operators have lesser priority than
+        unary operators.
+        It find the operator with lesser priority, uses it as root of this
+        (sub)tree using everything on its left as left parameter (so building
+        a left subtree with the part of the list located on left) and doing 
+        the same on right.
+        Since it searches for strings, and expressions into parenthesis are
+        within sub-lists, they won't be found here, ensuring that they will
+        have highest priority.'''
+        for i in range(len(expression)-1,-1,-1): 
+            if expression[i] in b_operators: #Binary operator              
+                self.kind=BINARY
+                self.name=expression[i]
+                self.left=node(expression[:i]) 
+                self.right=node(expression[i+1:])
+                return
+        '''Searches for unary operators, parsing from right to left'''
+        for i in range(len(expression)-1,-1,-1):
+            if expression[i] in u_operators: #Unary operator
+                self.kind=UNARY
+                self.name=expression[i]
+                self.prop=expression[1+i].strip()
+                self.child=node(expression[2+i])
+                
+                return       
+        pass
+    def toPython(self):
+        '''This method converts the expression into python code'''
+        if self.name in b_operators:
+            return '%s.%s(%s)' % (self.left.toPython(),op_functions[self.name],self.right.toPython())
+        elif self.name in u_operators:
+            prop =self.prop
+            
+            #Converting parameters
+            if self.name=='π':#Projection
+                prop='\"%s\"' %  prop.replace(' ','').replace(',','\",\"')
+            elif self.name=="ρ": #Rename
+                prop='{\"%s\"}' % prop.replace(',','\",\"').replace('➡','\":\"').replace(' ','')
+            else: #Selection
+                prop='\"%s\"' %  prop
+                        
+            return '%s.%s(%s)' % (self.child.toPython(),op_functions[self.name],prop)
+        else:
+            return self.name
+        pass
+    def result_format(self,rels):
+        '''This function returns a list containing the fields that the resulting relation will have.
+        Since it needs to know real instances of relations, it requires a dictionary where keys are
+        the names of the relations and the values are the relation objects.'''
+        if rels==None:            
+            return
+        
+        if self.kind==RELATION:
+            return list(rels[self.name].header.attributes)
+        elif self.kind==BINARY and self.name in ('-','ᑌ','ᑎ'):
+            return self.left.result_format(rels)
+        elif self.name=='π':
+            l=[]
+            for i in self.prop.split(','):
+                l.append(i.strip())
+            return l
+        elif self.name=='*':
+            return self.left.result_format(rels)+self.right.result_format(rels)
+        elif self.name=='σ' :
+            return self.child.result_format(rels)
+        elif self.name=='ρ':
+            _vars={}
+            for i in self.prop.split(','):
+                q=i.split('➡')
+                _vars[q[0].strip()]=q[1].strip()
+            
+            _fields=self.child.result_format(rels)
+            for i in range(len(_fields)):
+                if _fields[i] in _vars:
+                    _fields[i]=_vars[_fields[i]]
+            return _fields
+        elif self.name in ('ᐅᐊ','ᐅLEFTᐊ','ᐅRIGHTᐊ','ᐅFULLᐊ'):
+            return list(set(self.left.result_format(rels)).union(set(self.right.result_format(rels))))
+            
+            
+        pass
+    
+    def __eq__(self,other):
+        if not (isinstance(other,node) and self.name==other.name and self.kind==other.kind):
+            return False
+        
+        if self.kind==UNARY:
+            if other.prop!=self.prop:
+                return False
+            return self.child==other.child
+        if self.kind==BINARY:
+            return self.left==other.left and self.right==other.right
+        return True
+    def __str__(self):
+        if (self.kind==RELATION):
+            return self.name
+        elif (self.kind==UNARY):
+            return self.name + " "+ self.prop+ " (" + self.child.__str__() +")"
+        elif (self.kind==BINARY):
+            if self.left.kind==RELATION:
+                le=self.left.__str__()
+            else:
+                le="("+self.left.__str__()+")"
+            if self.right.kind==RELATION:
+                re=self.right.__str__()
+            else:
+                re="("+self.right.__str__()+")"
+                
+            return (le+ self.name +re)
+
+def tokenize(expression):
+    '''This function converts an expression into a list where
+    every token of the expression is an item of a list. Expressions into
+    parenthesis will be converted into sublists.'''
+    items=[] #List for the tokens
+    
+    '''This is a state machine. Initial status is determined by the starting of the
+    expression. There are the following statuses:
+    
+    relation: this is the status if the expressions begins with something else than an
+        operator or a parenthesis.
+    binary operator: this is the status when parsing a binary operator, nothing much to say
+    unary operator: this status is more complex, since it will be followed by a parameter AND a
+        sub-expression.
+    sub-expression: this status is entered when finding a '(' and will be exited when finding a ')'.
+        means that the others open must be counted to determine which close is the right one.'''
+    
+    expression=expression.strip() #Removes initial and endind spaces
+    state=0
+    '''
+    0 initial and useless
+    1 previous stuff was a relation
+    2 previous stuff was a sub-expression
+    3 previous stuff was a unary operator
+    4 previous stuff was a binary operator
+    '''
+
+    while len(expression)>0:
+        if expression.startswith('('): #Parenthesis state
+            state=2
+            par_count=0 #Count of parenthesis
+            end=0
+            
+            for i in range(len(expression)):
+                if expression[i]=='(':
+                    par_count+=1
+                elif expression[i]==')':
+                    par_count-=1
+                    if par_count==0:
+                        end=i
+                        break
+            #Appends the tokenization of the content of the parenthesis
+            items.append(tokenize(expression[1:end]))
+            #Removes the entire parentesis and content from the expression
+            expression=expression[end+1:].strip()
+        
+        elif expression.startswith("σ") or expression.startswith("π") or expression.startswith("ρ"): #Unary 2 bytes
+            items.append(expression[0:2]) #Adding operator in the top of the list
+            expression=expression[2:].strip() #Removing operator from the expression
+            par=expression.find('(')
+        
+            items.append(expression[:par]) #Inserting parameter of the operator
+            expression=expression[par:].strip() #Removing parameter from the expression
+        elif expression.startswith("*") or expression.startswith("-"): # Binary 1 byte
+            items.append(expression[0])
+            expression=expression[1:].strip() #1 char from the expression
+            state=4
+        elif expression.startswith("ᑎ") or expression.startswith("ᑌ"): #Binary short 3 bytes
+            items.append(expression[0:3]) #Adding operator in the top of the list
+            expression=expression[3:].strip() #Removing operator from the expression
+
+            state=4
+        elif expression.startswith("ᐅ"): #Binary long
+            i=expression.find("ᐊ")
+            items.append(expression[:i+3])
+            expression=expression[i+3:].strip()
+            
+            state=4
+        else: #Relation (hopefully)
+            if state==1: #Previous was a relation, appending to the last token
+                i=items.pop()
+                items.append(i+expression[0])
+                expression=expression[1:].strip() #1 char from the expression
+            else:
+                state=1
+                items.append(expression[0])
+                expression=expression[1:].strip() #1 char from the expression
+    
+    return items
+
+def tree(expression):
+    '''This function parses a relational algebra expression into a tree and returns
+    the root node using the Node class defined in this module.'''
+    return node(tokenize(expression))
+
+

 def parse(expr):
    '''This function parses a relational algebra expression, converting it into python, 
@ -50,7 +285,7 @@ def parse(expr):
    ρid➡i,name➡n(π a,b(A))
    A ᐅᐊ B
    '''
-    return optimizer.tree(expr).toPython()
+    return tree(expr).toPython()
    
 if __name__=="__main__":
    while True: