class Probleme:
    """"permet de efinir un probleme"""

    def actions(self):
        """returne la liste d'actions"""    
        return ([])

    def etats(self):
        """returne la liste d'etats"""    
        return ([])

    
    def transition(self,s,a):
        """definit les consequence d une action"""
        return(s)

   

    def recompense(self,s,a,sarr):
        """definit la recompense obtenue"""
        return(0)





class Laby(Probleme):
    """un probleme ou on se deplace
    - les actions connues sont N,S,E,O
    - l'etat est un couple (x,y)"""

    """pose des trous dans laby"""
    def __init__(self):
        self.obstacle=[]
        self.ajouterZoneObstacle(-5,-5,0,20)
        self.ajouterZoneObstacle(-5,-5,20,0)
        self.ajouterZoneObstacle(0,15,20,20)
        self.ajouterZoneObstacle(15,0,20,20)
        self.ajouterZoneObstacle(0,3,6,5)
        self.ajouterZoneObstacle(3,9,20,12)
        

        
        
    def ajouterZoneObstacle(self,dx,dy,fx,fy):
        """ajoute un obstacle dans la zone retcangle (dx,dx)->(fx,fy)"""
        for x in range(dx,fx+1):
            for y in range(dy,fy+1):
                self.obstacle+=[(x,y)]
    

      
    def printLaby(self):
        for j in range(-5,20):
            chaine=""
            for i in range(-5,20):
                etat=(i,j)
                if etat in self.obstacle:
                    chaine+="X"
                else:
                    chaine+="."
            print(chaine)
        
        

    def transition(self,s,a):
        """fait evoluer d une unite le systeme"""
        """un etat est le quadruplet ((xdep,ydep),(xprec,yprec))"""
        """action c'est (dx,dy) (delat et delty)"""

        #si on est dans un obstacle, etat = "Mort"
        if ((s=="mort") or (s[0] in self.obstacle)):
            #print("mort")
            return("mort")

        #sinon on fait calcul
        x=s[0][0]
        y=s[0][1]
        
        xPrec=s[1][0]
        yPrec=s[1][1]

        xApres=x+(x-xPrec)
        yApres=y+(y-yPrec)

        xApres=x+a[0]
        yApres=y+a[1]       
        return(((xApres,yApres),(x,y)))
    

    #les recompenses 
    def recompense(self,s,a,sarr):
        
        #si etat est mort, plus d'evolution
        if (sarr=="mort"):
            return(0)    
        
        posFin=sarr[0]
        #si l'etat d arrivee est (14,14) ==> ok
        if ((posFin[0]==14)and(posFin[1]==14)):
            return(100)

        #si l'etat d'arrivee est un obstacle, recompense negative forte
        if (posFin in self.obstacle):
            return(-200)

        #sinon, si on a un obstacle 
        if (posFin[0]==-1)and (posFin[1]==-1):
            return(0)

        #enfin, sinon, cout de 1
        return(-1)

    #la liste des actions
    def actions(self):
        return([(1,0),(1,1),(1,-1),(0,0),(0,1),(0,-1),(-1,0),(-1,-1),(-1,1)])

              
    

class Systeme:
    """permet d'executer un probleme
    - pb : attribut du probleme"""

    def __init__(self,pb):
        """construit un systeme a partir d'un probleme"""
        self.pb=pb

    def execute(self,s,a):
        #print ("etat depart: ",s)
        #print ("action: ",a)
        sArriv = self.pb.transition(s,a)
        #print ("etat arrivee: ",sArriv)
        #print ("recompense: ",self.pb.recompense(s,a,sArriv))
        #print("********")
        return(sArriv)

    def intialiseQ(self):
        """construit des Qvaleurs vides"""
        Q = {}
    

    def planifie(self,nb,s):
        gamma=0.999
        #Q=self.intialiseQ()
        Q={}
        for action in self.pb.actions():
            Q[(s,action)]=0
       
        #une iteration
        for i in range(0,nb):
            print("iteration :",i)
            Q2={}
            #pour chaque etat,action
            for etatAction in Q:            
                    etat=etatAction[0]
                    action=etatAction[1]
                    #calculer arrivee
                    sArriv=self.pb.transition(etat,action)
                    r=self.pb.recompense(etat,action,sArriv)
                    # cherche max arrivee
                    max=-100000
                    for actionMax in self.pb.actions():
                        #si la clef n'existe pas, on cree
                        if (not((sArriv,actionMax) in Q)):
                            Q2[(sArriv,actionMax)]=0
                            if (max<0):
                                max=0
                        else :
                            if (Q[(sArriv,actionMax)]>max):
                                max=Q[(sArriv,actionMax)]                  
                    #mise à jour                    
                    Q2[(etat,action)]=r+gamma*max
                    
            #on augmente iteration
            Q=Q2
            print("* nb etats:",len(Q))
        return(Q)

    def executerPi(self,pi,depart,nb):
        s=depart
        for i in range(nb):
            action=pi[s]
            sFin=self.execute(s,action)
            r=self.pb.recompense(s,action,sFin)
            print(s," -> ",action," : ",sFin,"<",r,">")
            s=sFin
                    

    def afficherQ(self,Q):
        for etatAction in Q:
            etat=etatAction[0]
            action=etatAction[1]
            chaine = ""+str(etat)+" - "+str(action)+" -> "+str(Q[(etat,action)])+", "
            print(chaine)

    def afficherQS(self,Q,etat):
        chaine = ""+str(etat)+" - "
        for action in self.pb.actions():
            chaine+=action+" -> "+str(Q[(etat,action)])+", "
        print(chaine)
            

    def politiqueFromQ(self,Q):
        pi={}
        for etatAction in Q:
            etat=etatAction[0]
            # cherche max arrivee
            max=-100000
            amax=-1;
            for actionMax in self.pb.actions():
                if (Q[(etat,actionMax)]>max):
                    max=Q[(etat,actionMax)]
                    amax=actionMax
            pi[etat]=amax
        return(pi)

    def apprentissage(self,Sdep):
        print("")
        



pb=Laby()
#print(pb.etats())
#print(pb.actions())
print("****************************************")
pb.printLaby()
print("****************************************")

systeme=Systeme(pb)
sDep=((1,1),(1,1))
s=systeme.execute(sDep,(1,1))
s=systeme.execute(s,(1,1))

print("****************************************")
print("planification")
Q=systeme.planifie(50,sDep)
print(len(Q))
#systeme.afficherQ(Q)
#systeme.afficherQS(Q,(3,2))
#systeme.afficherQS(Q,(3,3))


print("****************************************")
pi=systeme.politiqueFromQ(Q)
#print(pi)
systeme.executerPi(pi,sDep,30)

        
    


