package tmm1;

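/*
 * 2019-08-05
 *  Known problem involving takeAction() and getActionParam();
 *  see the BUG comment inside Agent.takeAction().
 *  (The original note was not in English and is garbled in this copy;
 *  this wording is a reconstruction.)
 */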

import java.awt.Color;
import java.awt.Dimension;
import java.awt.Font;
import java.awt.Graphics;
import java.awt.event.MouseEvent;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.stream.Collectors;

import static tmm1.TMM2v2.Action.*;
import static tmm1.TMM2v2.Item.*;

import lab.Lab;
import lab.Lab.LabCode;

public class TMM2v2 {
    /**
     * Program entry point: registers a selectable class with {@code Lab},
     * prints {@code Lab.selectableClasses}, and passes {@link TMM2Main1}
     * to a freshly created {@code LabCode}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // NOTE(review): this registers TMM2v1 although the enclosing class is TMM2v2 — confirm.
        Lab.addSelectableClass(TMM2v1.class);
        final String registered = "" + Lab.selectableClasses;
        System.out.println(registered);

        final LabCode launcher = new LabCode();
        launcher.main(TMM2Main1.class);
    }
    /**
     * Actions a rule can select.
     * {@code Call} and {@code Return} are handled by the agent itself
     * (sub-goal stack push/pop); every other constant is forwarded to
     * {@code World.takePrimitiveAction}.
     * The declaration order is kept stable because ordinals may be relied upon elsewhere.
     */
    public static enum Action {
        MoveForward,
        Load,
        Unload,
        TurnRight,
        TurnLeft,
        Find1,
        Find2,
        Reset1,
        Reset2,
        LookAround1,
        LookAround2,
        Call,
        Return,
        Fail
    }
    public static enum Item {
        Wall(''),  // 
        AgentN(''),  // 
        AgentE(''), // 
        AgentW(''), // 
        AgentS(''),  //
        Agent(''),
        Stone(''), // 
        Shell('k'), // k
        Nut(''), // 
        Meat(''), // 
        Leftovers('c'), // c
        Space('E'); // E 

        /** Character associated with this item constant. */
        public final char code;

        /**
         * Stores the character associated with the constant.
         * (Enum constructors are implicitly private.)
         */
        Item(char code) {
            this.code = code;
        }
        /**
         * Returns the item constant that stands for the agent.
         * The {@code direction} argument is currently ignored: the result is
         * always {@code Agent}, never one of the direction-specific constants
         * (AgentN / AgentE / AgentS / AgentW).
         *
         * @param direction heading of the agent (unused)
         * @return {@code Item.Agent}
         */
        public static Item agentChar(int direction) {
            return Item.Agent;
        }
    }
    /**
     * Declarative (not yet compiled) state pattern: an ordered list of
     * elements which may be constants, {@code VariableN} instances or the
     * wildcard marker.
     */
    public static class StateN {
        /** Pattern elements in positional order. */
        public List<Object> elems;

        /**
         * @param vars the pattern elements; stored by reference, not copied
         */
        public StateN(List<Object> vars) {
            this.elems = vars;
        }

        /**
         * Renders the pattern as {@code s(e1, e2, )}; note that every element,
         * including the last one, is followed by {@code ", "}.
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("s(");
            for (Object elem : elems) {
                text.append(elem.toString()).append(", ");
            }
            text.append(")");
            return text.toString();
        }
    }
    /**
     * Declarative form of a {@code Call} action: wraps the state pattern
     * {@code m} that the call is parameterised with.
     */
    public static class CallN {
        /** Parameter pattern of the call. */
        public StateN m;

        /**
         * @param m parameter pattern of the call
         */
        public CallN(StateN m) {
            this.m = m;
        }

        /** Renders as {@code c(<m>)}. */
        @Override
        public String toString() {
            return new StringBuilder("c(").append(m).append(")").toString();
        }
    }
    /**
     * Declarative rule Q(s, g, a): a state pattern {@code s}, a goal pattern
     * {@code g} and the action {@code a} to take when both match.
     */
    public static class RuleN {
        /** State pattern and goal pattern. */
        public StateN s, g;
        /** Action attached to the rule. */
        public ActionN a;

        /**
         * @param s state pattern
         * @param g goal pattern
         * @param a action to take
         */
        public RuleN(StateN s, StateN g, ActionN a) {
            this.s = s;
            this.g = g;
            this.a = a;
        }

        /** Renders as {@code rule(<s>, <g>, <a>)}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("rule(");
            text.append(s).append(", ");
            text.append(g).append(", ");
            text.append(a).append(")");
            return text.toString();
        }
    }
    /**
     * Declarative action: an {@link Action} constant plus an optional
     * parameter pattern {@code m} ({@code null} when the action has none).
     */
    public static class ActionN {
        /** The action constant. */
        public Action a;
        /** Parameter pattern, or {@code null}. */
        public StateN m;

        /**
         * @param a the action constant
         * @param m parameter pattern, may be {@code null}
         */
        public ActionN(Action a, StateN m) {
            this.a = a;
            this.m = m;
        }

        /** Renders as {@code Name} or, when a parameter is present, {@code Name(<m>)}. */
        @Override
        public String toString() {
            final String label = a.toString();
            return (m == null) ? label : label + "(" + m + ")";
        }
    }
    /**
     * Named variable usable inside a {@code StateN} pattern.
     * The class does not override {@code equals}, so two variables are the
     * same only when they are the same instance.
     */
    public static class VariableN {
        /** Display name of the variable. */
        public String name;

        /**
         * @param name display name of the variable
         */
        public VariableN(String name) {
            this.name = name;
        }
    }
    /**
     * Small DSL base class for declaring rules. Subclasses call the
     * {@code q(...)} overloads from {@link #makeRules()} to append
     * {@code RuleN} entries to {@link #ruleList}; {@code s(...)} and
     * {@code c(...)} build the six-element state / call patterns.
     */
    public static abstract class AbstractMakeRule extends Lab.Code {
        // Pattern variables shared by all rules declared through this instance.
        VariableN o1 = new VariableN("o1");
        VariableN x1 = new VariableN("x1");
        VariableN y1 = new VariableN("y1");
        VariableN o2 = new VariableN("o2");
        VariableN x2 = new VariableN("x2");
        VariableN y2 = new VariableN("y2");

        /** Wildcard marker (two underscores); same object as {@code Rule.WILDCARD}. */
        public static final String __ = Rule.WILDCARD; // Two underscores.
        public static final String Nowhere = "Nowhere".intern();

        /** Rules declared so far, in declaration order. */
        List<RuleN> ruleList = new ArrayList<>();

        /**
         * Builds a six-element state pattern (o1, x1, y1, o2, x2, y2).
         * The returned pattern is backed by a fixed-size list.
         */
        public StateN s(Object o1, Object x1, Object y1,
                Object o2, Object x2, Object y2){
            return new StateN(Arrays.asList(o1, x1, y1, o2, x2, y2));
        }

        /** Builds a call whose parameter is the six-element pattern given. */
        public CallN c(Object o1, Object x1, Object y1,
                Object o2, Object x2, Object y2){
            StateN param = new StateN(Arrays.asList(o1, x1, y1, o2, x2, y2));
            return new CallN(param);
        }

        /** Declares a rule with a parameterless action. */
        public void q(StateN s, StateN g, Action a){
            register(s, g, a, null);
        }

        /** Declares a rule whose action carries the parameter pattern {@code m}. */
        public void q(StateN s, StateN g, Action a, StateN m){
            register(s, g, a, m);
        }

        /** Declares a rule whose action is {@code Action.Call} with the call's pattern. */
        public void q(StateN s, StateN g, CallN c){
            register(s, g, Action.Call, c.m);
        }

        /** Appends one rule to {@link #ruleList}. */
        private void register(StateN s, StateN g, Action a, StateN m){
            ruleList.add(new RuleN(s, g, new ActionN(a, m)));
        }

        /**
         * Declares the rules of the concrete rule set.
         *
         * @return the declared rules
         */
        public abstract List<RuleN> makeRules();
    }
    //--------------------------------------------------
    public static class RuleTest1 extends AbstractMakeRule {
        public List<RuleN> makeRules(){
            // Just for abbreviation
            //final Item A = Item.AgentN; 
            final Item A = Item.Agent; 
            // Declarations of Q(s,g,a)
            //q(s(o1,x1,y1,o2,x2,y2), s(o1,x1,y1,o2,x2,y2), C,s(o1,x1,y1,o2,x2,y2));
            {   // o1 ɋ߂ÂTu[`B
                StateN g = s(o1,0,1,A,0,0);
                // ̂Qւ̒ӂZbgB
                q(s(o1,x1,y1,o2,x2,y2), s(o1,0,1,A,0,0), Reset2);
                // O o1 ɋ߂ÂB
                q(s(o1,x1,1,A,0,0), g, MoveForward);
                q(s(o1,x1,2,A,0,0), g, MoveForward);
                q(s(o1,x1,3,A,0,0), g, MoveForward);
                q(s(o1,x1,4,A,0,0), g, MoveForward);
                q(s(o1,x1,5,A,0,0), g, MoveForward);
                //  o1 ɌČςB
                q(s(o1,x1,-1,A,0,0), g, TurnRight);
                q(s(o1,x1,-2,A,0,0), g, TurnRight);
                q(s(o1,x1,-3,A,0,0), g, TurnRight);
                q(s(o1,x1,-4,A,0,0), g, TurnRight);
                q(s(o1,x1,-5,A,0,0), g, TurnRight);
                // E o1 ɌςB
                q(s(o1,1,0,A,0,0), g, TurnRight);
                q(s(o1,2,0,A,0,0), g, TurnRight);
                q(s(o1,3,0,A,0,0), g, TurnRight);
                q(s(o1,4,0,A,0,0), g, TurnRight);
                q(s(o1,5,0,A,0,0), g, TurnRight);
                //  o1 ɌςB
                q(s(o1,-1,0,A,0,0), g, TurnLeft);
                q(s(o1,-2,0,A,0,0), g, TurnLeft);
                q(s(o1,-3,0,A,0,0), g, TurnLeft);
                q(s(o1,-4,0,A,0,0), g, TurnLeft);
                q(s(o1,-5,0,A,0,0), g, TurnLeft);
            }
            {   // o1  o2 ̂PȌꏊɈړTu[`B
                StateN g = s(o1,0,0,o2,0,1);
                // O o2 ꍇB߂ÂB
                q(s(o1,0,0,o2,x2,1), g, MoveForward);
                q(s(o1,0,0,o2,x2,2), g, MoveForward);
                q(s(o1,0,0,o2,x2,3), g, MoveForward);
                q(s(o1,0,0,o2,x2,4), g, MoveForward);
                q(s(o1,0,0,o2,x2,5), g, MoveForward);
                //  o2 ꍇBςB
                q(s(o1,0,0,o2,x2,-1), g, TurnRight);
                q(s(o1,0,0,o2,x2,-2), g, TurnRight);
                q(s(o1,0,0,o2,x2,-3), g, TurnRight);
                q(s(o1,0,0,o2,x2,-4), g, TurnRight);
                q(s(o1,0,0,o2,x2,-5), g, TurnRight);
                // E o2 ꍇBEB
                q(s(o1,0,0,o2,1,0), g, TurnRight);
                q(s(o1,0,0,o2,2,0), g, TurnRight);
                q(s(o1,0,0,o2,3,0), g, TurnRight);
                q(s(o1,0,0,o2,4,0), g, TurnRight);
                q(s(o1,0,0,o2,5,0), g, TurnRight);
                //  o2 ꍇBB
                q(s(o1,0,0,o2,-1,0), g, TurnLeft);
                q(s(o1,0,0,o2,-2,0), g, TurnLeft);
                q(s(o1,0,0,o2,-3,0), g, TurnLeft);
                q(s(o1,0,0,o2,-4,0), g, TurnLeft);
                q(s(o1,0,0,o2,-5,0), g, TurnLeft);
            }
            {   // Nut HׂTu[`B
                //StateN g = s(Leftovers,0,1,A,0,0);
                StateN g = s(Leftovers,0,1,__,__,__);
                // ̂Q𖳎
                q(s(__,__,__,__,__,__), g, Reset2);
                // Nut TB
                q(s(__,__,__,__,0,0), g, c(Nut,x1,y1,__,0,0));
                {   // Nut ꍇB
                    // Nut ̂ƂɈړB
                    q(s(Nut,x1,y1,__,0,0), g, c(Nut,0,1,__,0,0));
                }
                {   // Nut ȂꍇB
                    // Stone TB
                    q(s(A,0,0,A,0,0), g, c(Stone,x1,y1,A,0,0));
                    q(s(Stone,x1,y1,A,0,0), g, c(Stone,0,1,A,0,0));
                    // Stone B
                    q(s(Stone,0,1,A,0,0), g, Load);
                    // Shell TB
                    q(s(Stone,0,0,Stone,0,0), g, c(Stone,0,0,Shell,x2,y2));
                    // Stone ܂ Shell ̑OɈړB
                    q(s(Stone,0,0,Shell,x2,y2), g, c(Stone,0,0,Shell,0,1));
                    // Stone łBڂ̑O Nut ꂻɒӂB
                    q(s(Stone,0,0,Shell,0,1), g, Unload);
                    // o1  Nut ɂȂ悤ɂB
                    q(s(Stone,0,0,Nut,0,1), g, c(Nut,0,1,Nut,0,1));
                    q(s(Nut,0,1,Nut,0,1), g, Reset2);
                }
                q(s(Nut,0,1,__,__,__), g, Load);
            }
            {   // ̂PTTu[`B
                StateN g = s(o1,__,__,o2,__,__);
                q(s(__,__,__,o2,__,__), g, Find1);
            }
            {   // ̂QTTu[`B
                StateN g = s(o1,__,__,o2,__,__);
                q(s(o1,__,__,__,__,__), g, Find2);
            }
            
            return ruleList;
        }
    }
    //--------------------------------------------------
    /**
     * Compiled, matchable form of a declarative {@code RuleN} for Q(s,g,a):
     * a pattern over the concatenated (state, goal) vector, the action to
     * take, and the rule's Q value. All value comparisons are by identity
     * ({@code ==}), which is why integer pattern constants are restricted to
     * the range -128..127 in {@link #transStateN}.
     */
    public static class Rule {
        /**
         * Q value of this rule.
         */
        public float q;
        /**
         * Number of distinct variables in the (s, g) pattern plus the number
         * of wildcard positions in it.
         */
        public int numVars;
        /** Sentinel stored in {@link #env} for a variable with no binding; identical to no other object. */
        public static final Object UNBOUND = new Object[]{"UNBOUND"};
        /** Value emitted by {@link #getActionParam()} for wildcards and unbound variables. */
        public static final Object PHI = "PHI".intern();
        /** Wildcard marker: a pattern position holding it is not compared by {@link #match}. */
        public static final String WILDCARD = "__".intern(); // Two underscores.
        /** Variable bindings, indexed by {@code PatternVariable.id}. */
        public Object[] env;
        public Object[] patternVec; // Concatenated pattern of s and g.
        public Action action;
        public Object[] actionPatternVec;  // Pattern of m of action C_m.
        /** Next id to hand out while translating patterns. */
        public int idCounter = 0;
        /** Translation-time map from declared variables to pattern variables; null after construction. */
        public Map<VariableN,PatternVariable> vmap = new HashMap<>();

        /**
         * Compiles the given declarative rule.
         *
         * @param ruleN the declarative rule to compile
         */
        public Rule(RuleN ruleN){
            List<Object> pattern = transStateN(ruleN.s);
            pattern.addAll(transStateN(ruleN.g));

            int wildcards = 0;
            for (Object elem : pattern) {
                if (elem == WILDCARD) {
                    wildcards++;
                }
            }
            // Counted before the action pattern is translated, so variables
            // appearing only in the action pattern are not included.
            numVars = vmap.size() + wildcards;
            patternVec = pattern.toArray();

            action = ruleN.a.a;
            if (action == Action.Call){
                actionPatternVec = transStateN(ruleN.a.m).toArray();
            }
            // Sized after the action pattern so its variables get a slot too.
            env = new Object[vmap.size()];
            vmap = null;
        }

        /** Leaves every field at its default; used implicitly by {@code ReturnRule}. */
        public Rule(){
           // Implicitly called from ReturnRule().
        }

        /**
         * Translates a declarative pattern: each {@code VariableN} is replaced
         * by a {@code PatternVariable} (the same one for repeated occurrences
         * of the same variable instance); everything else is copied unchanged.
         *
         * @param s the declarative pattern
         * @return a new mutable list with the translated elements
         */
        public List<Object> transStateN(StateN s){
            List<Object> translated = new ArrayList<>();
            for (Object elem : s.elems) {
                if (elem instanceof VariableN) {
                    VariableN variable = (VariableN) elem;
                    PatternVariable slot = vmap.get(variable);
                    if (slot == null) {
                        slot = new PatternVariable(variable.name, idCounter++);
                        vmap.put(variable, slot);
                    }
                    translated.add(slot);
                } else {
                    if (elem instanceof Integer) {
                        int value = (Integer) elem;
                        // Accepts only small integers that can be compared with == operator.
                        Lab.assertTrue(-128 <= value && value <= 127);
                    }
                    translated.add(elem);
                }
            }
            return translated;
        }

        /** Marks every variable as unbound; call before {@link #match}. */
        public void resetMatchResult(){
            Arrays.fill(env, UNBOUND);
        }

        /**
         * Matches the concatenated (s, g) value vector against the pattern,
         * binding variables in {@link #env} on first occurrence. Bindings made
         * before a mismatch is detected are left in place.
         *
         * @param vals values to test; must have the pattern's length
         * @return true when every non-wildcard position is identical to its expected value
         */
        public boolean match(Object[] vals){
            Lab.assertTrue(vals.length == patternVec.length);
            for (int k = 0; k < vals.length; k++) {
                final Object pattern = patternVec[k];
                if (pattern == WILDCARD) {
                    continue;
                }
                final Object expected;
                if (pattern instanceof PatternVariable) {
                    final int slot = ((PatternVariable) pattern).id;
                    if (env[slot] == UNBOUND) {
                        env[slot] = vals[k];
                    }
                    expected = env[slot];
                } else {
                    expected = pattern;
                }
                if (vals[k] != expected) {
                    return false;
                }
            }
            return true;
        }

        /** @return the action of this rule */
        public Action getAction(){
            return action;
        }

        /**
         * Instantiates the parameter pattern of a {@code Call} action using
         * the current bindings in {@link #env}; wildcards and unbound
         * variables become {@link #PHI}.
         *
         * @return a new array; the stored pattern is not modified
         */
        public Object[] getActionParam(){
            Lab.assertTrue(action == Action.Call);
            final Object[] params = new Object[actionPatternVec.length];
            for (int k = 0; k < params.length; k++) {
                final Object pattern = actionPatternVec[k];
                if (pattern == WILDCARD) {
                    params[k] = PHI;
                } else if (pattern instanceof PatternVariable) {
                    final Object bound = env[((PatternVariable) pattern).id];
                    params[k] = (bound == UNBOUND) ? PHI : bound;
                } else {
                    params[k] = pattern;
                }
            }
            return params;
        }

        /** Renders the pattern, the action, the optional action pattern and the Q value. */
        @Override
        public String toString(){
            StringBuilder text = new StringBuilder("rule(");
            for (Object elem : patternVec) {
                text.append(elem).append(",");
            }
            text.append(action).append(",");
            if (actionPatternVec != null){
                for (Object elem : actionPatternVec) {
                    text.append(elem).append(",");
                }
            }
            text.append(").q = "+ q);
            return text.toString();
        }

        /** Variable inside a compiled pattern; {@code id} indexes {@code Rule.env}. */
        public static class PatternVariable {
            String name;
            int id;

            public PatternVariable(String name, int id){
                this.name = name;
                this.id = id;
            }

            @Override
            public String toString() {
                return String.valueOf(name);
            }
        }

        // Special instance used for Action.Return
        public static final Rule returnRule = new ReturnRule();
    }
    public static class ReturnRule extends Rule {
        public ReturnRule(){
            action = Action.Return;
            q = 0; // Q(g,g,RET) == 0
        }
        public String toString(){
            return "rule(Return).q = "+ q;
        }
    }
    /**
     * Mutable integer 2-D position (also used as a direction vector).
     * "Forward" is the +y direction.
     *
     * <p>Equality is value based: two positions are equal when both
     * coordinates are equal. Because instances are mutable, do not change a
     * position while it is used as a key in a hash-based collection.
     */
    public static final class Position {
        public int x, y;

        /** Creates the position (0, 0). */
        public Position(){ this.x = 0; this.y = 0; }

        /** Creates the position (x, y). */
        public Position(int x, int y){ this.x = x; this.y = y; }

        /** Returns an independent copy of this position. */
        @Override
        public Position clone(){
            return new Position(x, y);
        }

        /**
         * Coordinate-wise comparison.
         *
         * @param p position to compare with; {@code null} yields {@code false}
         * @return true when both coordinates are equal
         */
        public boolean equals(Position p){
            return p != null && x == p.x && y == p.y;
        }

        /**
         * Value-based equality, so that collections ({@code List.contains},
         * {@code HashSet}, ...) agree with {@link #equals(Position)}.
         */
        @Override
        public boolean equals(Object other){
            return other instanceof Position && equals((Position) other);
        }

        /** Hash code consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode(){
            return 31 * x + y;
        }

        /** Rotate the position counterclockwise. */
        public void rotateLeft(){
            int tx = -y; int ty = x;
            x = tx; y = ty;
        }

        /** Rotate the position clockwise. */
        public void rotateRight(){
            int tx = y; int ty = -x;
            x = tx; y = ty;
        }

        /** Moves one step in the +y direction. */
        public void moveForward(){
            y += 1;
        }

        /** Adds the vector {@code dir} to this position. */
        public void moveForward(Position dir){
            x += dir.x; y += dir.y;
        }

        /** Moves one step in the -y direction. */
        public void moveBackward(){
            y -= 1;
        }

        /** Subtracts the vector {@code dir} from this position. */
        public void moveBackward(Position dir){
            x -= dir.x; y -= dir.y;
        }

        /**
         * Misspelled variant kept for existing callers.
         * Same as {@link #moveBackward(Position)}.
         */
        public void moveBackword(Position dir){
            moveBackward(dir);
        }

        /**
         * Expresses this position relative to {@code origin}: subtracts the
         * origin and then rotates the result counterclockwise
         * {@code direction} times. A non-positive {@code direction} performs
         * no rotation.
         *
         * @return a new position; this instance is not modified
         */
        public Position relativeTo(Position origin, int direction){
            Position p = new Position(x - origin.x, y - origin.y);
            for (int i = 0; i < direction; i++) {
                p.rotateLeft();
            }
            return p;
        }

        /**
         * Inverse of {@link #relativeTo}: rotates a copy clockwise
         * {@code direction} times and then adds {@code origin}.
         *
         * @return a new position; this instance is not modified
         */
        public Position unRelativeTo(Position origin, int direction){
            Position p = this.clone();
            for (int i = 0; i < direction; i++) {
                p.rotateRight();
            }
            p.x += origin.x;
            p.y += origin.y;
            return p;
        }

        /** Renders as {@code (x, y)}. */
        @Override
        public String toString(){
            return "("+ x+ ", "+ y+ ")";
        }
    }
    /**
     * Concrete value vector. Instances are used for the observed state s,
     * the current sub-goal g, and the instantiated parameter m of a Call
     * action in Q(s,g,C_m).
     */
    public static class State {
        /** The values, stored by reference (not copied). */
        public Object[] values;

        public State(Object[] values) {
            this.values = values;
        }

        /** @return the underlying value array itself */
        public Object[] getVec(){
            return values;
        }

        /**
         * Compares two states in order to check if the agent reaches
         * the subgoal state x.
         * State x may contain the special values PHI,
         *   which matches to any values.
         * Values are compared by identity ({@code ==}).
         */
        public boolean satisfies(State x){
            final Object[] wanted = x.values;
            Lab.assertTrue(values.length == wanted.length);
            for (int k = 0; k < wanted.length; k++) {
                if (wanted[k] == Rule.PHI) {
                    continue;
                }
                if (values[k] != wanted[k]) {
                    return false;
                }
            }
            return true;
        }

        /** Renders as {@code State(v1,v2,)}; every value is followed by a comma. */
        @Override
        public String toString(){
            StringBuilder text = new StringBuilder("State(");
            for (Object value : values) {
                text.append(value.toString()).append(",");
            }
            return text.append(")").toString();
        }
    }
    
    
    
    //--------------------------------------------------
    public static class TMM2Main1 extends Lab.MainCode {
        //public int maxEpisodes = panel.getInt("max episodes", 1000000, 1, 100000);
        public int maxSteps = panel.getInt("max steps", 100, 1, 10000);
        public float alpha = panel.getFloat("alpha", 0.3f, 0, 1);
        //public float epsilon = panel.getFloat("elsilon", 0.1f, 0, 1);
        public float mChangeReward = panel.getFloat("m change R", -1, -10, 0);
        public int sizeX = panel.getInt("map size x", 8, 1, 100);
        public int sizeY = panel.getInt("map size Y", 8, 1, 100);
        public float vScale; 
        //public int viewSizeX = panel.getInt("View size x", sizeX, 1, 1000);
        //public int viewSizeY = panel.getInt("View size y", sizeY, 1, 1000);
        public lab.Lab.WTextArea qView = null;
        
        //  main
        /**
         * Creates a {@code World} and runs either its test loop or its
         * regular main loop, depending on the "test main" panel flag.
         */
        public void main() {
            final World world = new World();
            final boolean useTestLoop = panel.flag("test main", false);
            if (!useTestLoop) {
                world.main();
            } else {
                world.testMainLoop();
            }
        }
        public class Agent {
            public State newS; // state
            public State newG; // subgoal
            public Rule newR; // rule 
            public State oldS;
            public State oldG;
            public Rule oldR;
            public float reward;
            public Stack<State> stack;
            public State start, goal;
            public World world;
            public List<Rule> rules;
            public float initVal = panel.getFloat("Table init value", -10, -50, 0);
            public float beta = panel.getFloat("beta", 1, 0.01f, 100); // for softmax
            // Position and heading of the agent (presumably in absolute/world coordinates — confirm).
            public Position pos = new Position();
            public int headDirection; // 0,1,2,3
            // Positions of the two objects the agent is attending to (coordinate frame not shown here — confirm).
            public Position o1pos;
            public Position o2pos;
            public boolean objectNotFound = false;;
            //
            /**
             * Creates an agent living in the given world and builds its rule table.
             *
             * @param world the world this agent acts in
             */
            public Agent(World world) {
                this.world = world;
                this.initTable();
            }
            /**
             * Builds {@link #rules} by compiling every rule declared by
             * {@code RuleTest1}. Each rule's q keeps its default value (0);
             * {@code initVal} is not applied here.
             */
            public void initTable() {
                final List<RuleN> declared = new RuleTest1().makeRules();
                rules = declared.stream()
                        .map(Rule::new)
                        .collect(Collectors.toList());
            }
            /**
             * Starts a new episode: both the "old" and "new" state/goal
             * are set to the given values and the rule history is cleared.
             *
             * @param start initial state
             * @param goal  top-level goal
             */
            public void setStartAndGoal(State start, State goal){
                newS = start;
                oldS = start;
                this.start = start;

                newG = goal;
                oldG = goal;
                this.goal = goal;

                initHist();
            }
            /**
             * Creates a fresh sub-goal stack, selects the first rule, makes it
             * the current ("old") rule and records it in the history.
             */
            public void chooseFirstAction(){
                stack = new Stack<>();
                chooseAction();
                final Rule first = newR;
                oldR = first;
                addToHist(first);
            }
            // 
            /**
             * Executes the action of {@code oldR} and sets {@code newS},
             * {@code newG} and {@code reward} accordingly.
             * <ul>
             * <li>Return: state unchanged, goal popped from the stack, reward 0.</li>
             * <li>Call: state unchanged, current goal pushed, new goal built from
             *     the rule's action parameters, reward {@code mChangeReward}.</li>
             * <li>Anything else: forwarded to the world as a primitive action,
             *     followed by a fresh observation; the goal is unchanged.</li>
             * </ul>
             */
            public void takeAction(){
                final Action action = oldR.getAction();
                final boolean isReturn = (action == Action.Return);
                final boolean isCall = (action == Action.Call);

                if (!isReturn && !isCall) {
                    // Primitive action: the world supplies the reward and the next observation.
                    reward = world.takePrimitiveAction(action, this);
                    newS = world.observe(this);
                    newG = oldG;
                    return;
                }

                newS = oldS;
                if (isReturn) {
                    newG = stack.pop();
                    reward = 0;
                } else {
                    stack.push(oldG);
                    /*
                     * BUG:
                     *  getActionParam() reads the bindings stored in oldR.env.
                     *  update() may re-run matching (evalValue ->
                     *  selectMatchedRules), which resets the env of every
                     *  rule, so getActionParam() has to be called before
                     *  that happens or the bound values are lost.
                     */
                    newG = new State(oldR.getActionParam());
                    reward = mChangeReward;
                }
            }
            /**
             * Selects {@code newR} for the pair ({@code newS}, {@code newG}).
             * If the state already satisfies the goal the return rule is
             * chosen; otherwise one of the matching rules is drawn by softmax
             * over their priorities.
             */
            public void chooseAction(){
                if (newS.satisfies(newG)){
                    newR = Rule.returnRule;
                    return;
                }

                final List<Rule> candidates = selectMatchedRules(newS, newG);
                final float[] priorities = calcRulePriorities(candidates);
                if (priorities.length == 0){
                    // NOTE(review): only reported here; the selection below still runs with no candidates.
                    System.out.println("q.length == 0, (news,newG)="+
                            newS+ ", "+ newG);
                }

                // Draw a rule index according to the softmax distribution.
                final int chosen = softmax(priorities);

                if (panel.flag("Show matched rules", true)){
                    int row = 0;
                    for (Rule candidate : candidates) {
                        env.viewPanel.println("matched", row+ ":"+ candidate);
                        row++;
                    }
                    for (int k = 0; k < priorities.length; k++) {
                        env.viewPanel.println("priority", k+ ":"+ priorities[k]);
                    }
                    for (int k = 0; k < probTable.length; k++) {
                        env.viewPanel.println("probTable", k+ ":"+ probTable[k]);
                    }
                }
                newR = candidates.get(chosen);
            }
            /**
             * Returns the rules whose pattern matches the pair (s, g).
             * The bindings of <em>all</em> rules are reset first, so any
             * bindings from a previous match are lost.
             *
             * @param s state values
             * @param g goal values
             * @return matching rules, in rule-table order
             */
            public List<Rule> selectMatchedRules(State s, State g){
                // Concatenate the values of s and g into one vector.
                final int sLen = s.values.length;
                final int gLen = g.values.length;
                final Object[] vals = new Object[sLen + gLen];
                System.arraycopy(s.values, 0, vals, 0, sLen);
                System.arraycopy(g.values, 0, vals, sLen, gLen);

                for (Rule rule : rules) {
                    rule.resetMatchResult();
                }

                // Keep the rules that match (s, g).
                final List<Rule> matched = new ArrayList<>();
                for (Rule rule : rules) {
                    if (rule.match(vals)) {
                        matched.add(rule);
                    }
                }
                return matched;
            }
            public float genericityPenalty = panel.getFloat("gen penalty", 100, 0, 100);
            /**
             * Computes the selection priority of each matched rule:
             * {@code q - genericityPenalty * numVars}, so rules with fewer
             * variables/wildcards are preferred.
             *
             * @param matched the candidate rules
             * @return priorities, index-aligned with {@code matched}
             */
            public float[] calcRulePriorities(List<Rule> matched){
                final float[] priorities = new float[matched.size()];
                int k = 0;
                for (Rule rule : matched) {
                    priorities[k++] = rule.q - genericityPenalty * rule.numVars;
                }
                return priorities;
            }
            public boolean eTrace = panel.flag("eTrace", false); 
            /**
             * Learning step followed by the old/new hand-over.
             * Unless the rule just executed was the return rule, computes
             * {@code delta = reward + newR.q - oldR.q + vg}, where
             * {@code vg} is 0 when the goal did not change and
             * {@code evalValue(oldG, newG)} otherwise, and applies it either
             * directly to {@code oldR.q} or through the eligibility trace.
             * Finally the "new" state, goal and rule become the "old" ones.
             */
            public void update() {
                if (oldR != Rule.returnRule){
                    // V_g(g'): only evaluated when the sub-goal changed.
                    final float vg = (oldG == newG) ? 0 : evalValue(oldG, newG);
                    final float delta = reward + newR.q - oldR.q + vg;
                    if (eTrace){
                        updateWithEligibilityTrace(delta);
                    } else {
                        oldR.q += alpha * delta;
                    }
                    if (newR != Rule.returnRule){
                        addToHist(newR);
                    }
                }
                oldS = newS;
                oldG = newG;
                oldR = newR;
            }
            public boolean approxValueEvalFlag = panel.flag("approxValueEvalFlag", false);
            /**
             * Returns V_g(s).
             * With {@code approxValueEvalFlag} set this is the q of the
             * matching rule with the highest priority; otherwise it is the
             * softmax-weighted sum of the q values of all matching rules.
             * Note: this re-runs matching and therefore resets the bindings
             * of every rule.
             */
            public float evalValue(State g, State s){
                final List<Rule> matched = selectMatchedRules(s, g);
                final float[] priorities = calcRulePriorities(matched);

                if (approxValueEvalFlag){
                    // V_g(s) \approx max_a Q(s,g,a)
                    final int best = Lab.argmax(priorities);
                    return matched.get(best).q;
                }

                // V_g(s) = \Sigma_a \pi((s,g),a)Q(s,g,a)
                calcProbTable(priorities, 0, priorities.length);
                float expected = 0;
                for (int k = 0; k < probTable.length; k++) {
                    final float ruleQ = matched.get(k).q;
                    // To avoid 0 * -Infinity = NaN
                    if (ruleQ == Float.NEGATIVE_INFINITY){
                        continue;
                    }
                    expected += probTable[k] * ruleQ;
                }
                return expected;
            }
            
            // Eligibility trace
            public float lambda = panel.getFloat("lambda", 0.9f, 0, 1);
            public int histSize = panel.getInt("histSize", 100, 1, 100);
            public Rule histR[];
            public int hTop;
            /**
             * Clears the rule history.
             * This method should be called before starting each episode.
             * The buffer holds twice {@code histSize} entries so that old
             * entries can be discarded in bulk by {@code addToHist}.
             */
            public void initHist(){
                histR = new Rule[histSize * 2];
                hTop = 0;
            }
            /**
             * Appends a rule to the history. When the buffer becomes full, the
             * newer half is moved to the front and the older half is dropped.
             *
             * @param r the rule to record
             */
            public void addToHist(Rule r){
                histR[hTop++] = r;
                if (hTop < histR.length) {
                    return;
                }
                // Forget histories older than histSize.
                System.arraycopy(histR, histSize, histR, 0, histSize);
                hTop = histSize;
            }
            /**
             * Applies {@code delta} to the most recent rules in the history,
             * newest first, at most {@code histSize} of them; the amount is
             * multiplied by {@code lambda} after each rule.
             *
             * @param delta the TD error to distribute
             */
            public void updateWithEligibilityTrace(float delta){
                float decayed = delta;
                int index = hTop - 1;
                int remaining = histSize;
                while (remaining-- > 0) {
                    histR[index].q += alpha * decayed;
                    decayed *= lambda;
                    index--;
                    if (index < 0) {
                        break;
                    }
                }
            }
            
            // Softmax
            public double[] probTable = new double[0]; /** \pi(a) \in [0,1] */
            /** Softmax selection over the whole array; see {@link #softmax(float[], int, int)}. */
            public int softmax(float[] q){
                return softmax(q, 0, q.length);
            }

            /**
             * Draws an index in {@code [from, to)} with probability given by
             * the softmax of {@code q} (as computed by {@code calcProbTable}).
             * If rounding leaves the cumulative sum at or below the random
             * number, the last index {@code to - 1} is returned.
             *
             * @param q    priorities
             * @param from first index considered (inclusive)
             * @param to   last index considered (exclusive)
             * @return the selected index
             */
            public int softmax(float[] q, int from, int to){
                calcProbTable(q, from, to);
                final float r = Lab.rand();
                double cumulative = 0;
                int i = from;
                while (i < to) {
                    cumulative += probTable[i];
                    if (cumulative > r) {
                        Lab.assertTrue(q[i] != Float.NEGATIVE_INFINITY);
                        return i;
                    }
                    i++;
                }
                // Fallback: probabilities summed to (almost) 1 without exceeding r.
                Lab.assertTrue(cumulative - 0.001f < 1);
                final int last = to - 1;
                Lab.assertTrue(q[last] != Float.NEGATIVE_INFINITY);
                return last;
            }
            /**
             * Fills {@code probTable[from..to)} with the softmax distribution
             * \pi((s,g),a) = exp(beta * Q(s,g,a)) / \Sigma_a' exp(beta * Q(s,g,a')).
             * {@code probTable} is reallocated when its length differs from
             * that of {@code q}.
             *
             * @param q    priorities
             * @param from first index (inclusive)
             * @param to   last index (exclusive)
             */
            public void calcProbTable(float[] q, int from, int to){
                if (probTable.length != q.length){
                    probTable = new double[q.length];
                }
                // To avoid overflow, subtract max.
                // exp(a-c)/\Sigma_i exp(ai-c) = exp(a)/\Sigma_i exp(ai)
                final float max = Lab.max(q);
                double total = 0;
                for (int k = from; k < to; k++){
                    final double weight = Math.exp(beta * (q[k] - max));
                    probTable[k] = weight;
                    total += weight;
                }
                Lab.assertTrue(total > 0);
                // Normalise so the entries sum to 1.
                for (int k = from; k < to; k++){
                    probTable[k] /= total;
                }
            }
            
            
            //
            /** Visualizes the values; currently just delegates to visualizeAsCanvas(). */
            public void visualizeVals(){
                visualizeAsCanvas();
            }
            /** Asks the view panel to paint the "QTable" canvas using tablePainter. */
            public void visualizeAsCanvas(){
                // NOTE(review): the original comment is mis-encoded; it appears to ask
                // whether a bar graph would also do -- confirm with the author.
                env.viewPanel.paint("QTable", tablePainter);
            }
            /** Painter handed to the view panel by visualizeAsCanvas(). */
            public QTablePainter tablePainter = new QTablePainter();
            /** Pixel size of one cell; read from the panel (arguments presumably name, default, min, max -- confirm). */
            public int charSize = panel.getInt("charSize", 12, 1, 40);
            // NOTE(review): the font name looks like a mis-decoded Japanese font name;
            // verify the source file encoding.
            public Font f = new Font("lr SVbN", Font.PLAIN, charSize);
            /**
             * Painter for the Q-table canvas. It only reports a canvas size of
             * (charSize * sizeX) x (charSize * sizeY); painting is not implemented.
             */
            public class QTablePainter extends Lab.Code implements Lab.Painter {
                /** Returns the canvas size: one charSize-square cell per map cell. */
                public Dimension getSize(){
                    return new Dimension(charSize * sizeX, charSize * sizeY);
                }
                // Not used anywhere in this class.
                int counter = 0;
                /** Intentionally empty: nothing is drawn yet. */
                public void paintComponent(Graphics g, MouseEvent lastEvent) {
                    // Nothing to paint yet.
                }
            }
        }
        //--------------------------------------------------
        /** When true, World.main() redraws the map and agent state; refreshed from the panel at the start of every episode. */
        public boolean visualizeFlag;
        public class World {
            /** Grid of items, indexed as map[x][y]; rebuilt by initEpisode(). */
            public Item[][] map;
            // For now there is only a single agent.
            public Agent agent;
            /** Creates an empty world; map and agent are set up later by main()/testMainLoop(). */
            public World(){
            }
            /**
             * Runs episodes forever. Each episode resets the map, gives the agent its
             * start observation and a fixed goal, then repeats
             * takeAction / chooseAction / update until the agent's oldS satisfies the
             * goal or more than maxSteps steps have been attempted.
             */
            public void main(){
                agent = new Agent(this);
                int totalSteps = 0; // step counter shown in the view, never reset
                while (true) {
                    visualizeFlag = panel.flag("visualizeFlag", true);
                    panel.speedControl("Episode loop", 0);
                    initEpisode();

                    State start = observe(agent);
                    // Goal: Leftovers at relative position (0,1); second object unconstrained.
                    Object[] goalValues = {Leftovers, 0, 1, Rule.PHI, Rule.PHI, Rule.PHI};
                    State goal = new State(goalValues);
                    agent.setStartAndGoal(start, goal);
                    agent.chooseFirstAction();

                    int episodeSteps = 0;
                    while (true) {
                        if (agent.oldS.satisfies(goal)) {
                            break;
                        }
                        if (episodeSteps++ >= maxSteps) {
                            break;
                        }
                        env.viewPanel.print1("counter=", "" + totalSteps++);
                        if (visualizeFlag) {
                            panel.speedControl("Step loop", 100);
                            visualizeMap();
                            visualizeAgentState();
                        }

                        agent.takeAction();
                        agent.chooseAction();
                        agent.update();
                    }

                    // Show the final state of the episode.
                    if (visualizeFlag) {
                        visualizeMap();
                        visualizeAgentState();
                    }
                }
            }
            /**
             * Prints the agent's positions, goal stack and latest state/goal/rule to
             * the view panel, and re-plots the q value of every rule.
             */
            public void visualizeAgentState(){
                env.viewPanel.print1("agent.pos", String.valueOf(agent.pos));
                env.viewPanel.print1("agent.o1pos", String.valueOf(agent.o1pos));
                env.viewPanel.print1("agent.o2pos", String.valueOf(agent.o2pos));

                // "Goals" pane: stack from bottom to top, then the current goal.
                final String goalsLabel = "Goals";
                env.viewPanel.setText(goalsLabel, ""); // Clear text.
                final int depth = agent.stack.size();
                for (int level = 0; level < depth; level++) {
                    env.viewPanel.println(goalsLabel, "" + agent.stack.get(level));
                }
                env.viewPanel.println(goalsLabel, "" + agent.oldG);

                // "Log" pane: stack from top to bottom on one line, then s, g and rule.
                final String logLabel = "Log";
                env.viewPanel.println(logLabel, "---");
                StringBuilder stackLine = new StringBuilder();
                stackLine.append("stack size=").append(agent.stack.size()).append(":");
                for (int level = agent.stack.size() - 1; level >= 0; level--) {
                    stackLine.append(agent.stack.get(level)).append(", ");
                }
                env.viewPanel.println(logLabel, stackLine.toString());
                env.viewPanel.println(logLabel, "s,g=" + agent.oldS + ", " + agent.oldG);
                env.viewPanel.println(logLabel, "" + agent.oldR);

                // Graph of rule q values: dummy point first, then reset and re-plot.
                env.viewPanel.plotWithFixedY("rule.q", 0, -10, 0);// dummy
                env.viewPanel.resetGraphData("rule.q");
                agent.rules.forEach(r -> env.viewPanel.plot("rule.q", r.q));
            }
            /**
             * Manual test loop: creates one panel button per Action and, forever,
             * applies the primitive action of every pressed button to the agent and
             * redraws the map and the agent's positions.
             */
            public void testMainLoop(){
                agent = new Agent(this);
                initEpisode();

                final Action[] actions = Action.values();
                // Touch every button once before the loop starts.
                for (Action candidate : actions) {
                    panel.button(candidate.name());
                }

                while (true) {
                    panel.speedControl("World mainLoop", 100);
                    for (Action candidate : actions) {
                        if (!panel.button(candidate.name())) {
                            continue;
                        }
                        takePrimitiveAction(candidate, agent);
                    }
                    // Display.
                    visualizeMap();
                    env.viewPanel.print1("agent.pos", "" + agent.pos);
                    env.viewPanel.print1("agent.o1pos", "" + agent.o1pos);
                    env.viewPanel.print1("agent.o2pos", "" + agent.o2pos);
                }
            }
            /**
             * Rebuilds the map for a new episode: empty cells surrounded by a wall
             * border, the agent at (4,5) with heading 0 and both attention positions
             * reset to (0,0), plus one Stone and one Shell at random free cells.
             */
            public void initEpisode(){
                map = new Item[sizeX][sizeY];
                for (Item[] column : map) {
                    Arrays.fill(column, Item.Space);
                }

                // Wall border.
                final int topRow = sizeY - 1;
                final int rightColumn = sizeX - 1;
                for (int x = 0; x < sizeX; x++) {
                    map[x][topRow] = Item.Wall;
                    map[x][0] = Item.Wall;
                }
                for (int y = 0; y < sizeY; y++) {
                    map[rightColumn][y] = Item.Wall;
                    map[0][y] = Item.Wall;
                }

                // Agent start pose and attention.
                agent.headDirection = 0;
                agent.pos.x = 4;
                agent.pos.y = 5;
                agent.o1pos = new Position(0,0);
                agent.o2pos = new Position(0,0);
                map[agent.pos.x][agent.pos.y] = Item.agentChar(agent.headDirection);

                // Objects of this episode.
                putItemAtRandomPosition(Item.Stone);
                putItemAtRandomPosition(Item.Shell);
            }
            /**
             * Places {@code item} on a randomly chosen cell that currently holds
             * Item.Space.
             *
             * Cells are drawn with Lab.irand until a free one is hit. The map is
             * checked first so the search cannot loop forever on a full map.
             *
             * @param item the item to place
             * @throws IllegalStateException if the map contains no Space cell
             */
            public void putItemAtRandomPosition(Item item){
                boolean hasFreeCell = false;
                search:
                for (Item[] column : map) {
                    for (Item cell : column) {
                        if (cell == Item.Space) {
                            hasFreeCell = true;
                            break search;
                        }
                    }
                }
                if (!hasFreeCell) {
                    throw new IllegalStateException("No free cell left to place " + item);
                }
                for(;;){
                    int x = Lab.irand(map.length);
                    int y = Lab.irand(map[0].length);
                    if (map[x][y] == Item.Space){
                        map[x][y] = item;
                        return;
                    }
                }
            }
            /** Asks the view panel to paint the "Map" canvas using mapPainter. */
            public void visualizeMap(){
                env.viewPanel.paint("Map", mapPainter);
            }
            /** Painter handed to the view panel by visualizeMap(). */
            public MapPainter mapPainter = new MapPainter();
            /** Pixel size of one map cell; read from the panel (arguments presumably name, default, min, max -- confirm). */
            public int charSize = panel.getInt("charSize", 24, 1, 40);
            // NOTE(review): the font name looks like a mis-decoded Japanese font name;
            // verify the source file encoding.
            public Font f = new Font("lr SVbN", Font.PLAIN, charSize);
            /**
             * Draws the map: a line showing the agent's heading, one character per
             * cell, and a rectangle around each of the two attended cells
             * (green for o1, blue for o2). Map row y = 0 is drawn at the bottom.
             */
            public class MapPainter extends Lab.Code implements Lab.Painter {
                /** Returns the canvas size: the grid in pixels plus a small margin. */
                public Dimension getSize(){
                    final int width = charSize * sizeX + 1;
                    final int height = charSize * sizeY + 2;
                    return new Dimension(width, height);
                }
                int counter = 0;
                public void paintComponent(Graphics g, MouseEvent lastEvent) {
                    g.setFont(f);

                    // Heading indicator: a line from the agent's cell centre towards
                    // the cell it faces (screen y grows downwards).
                    g.setColor(Color.BLUE);
                    final int heading = agent.headDirection;
                    final int stepX;
                    final int stepY;
                    if (heading == 0) {
                        stepX = 0; stepY = -1;
                    } else if (heading == 1) {
                        stepX = 1; stepY = 0;
                    } else if (heading == 2) {
                        stepX = 0; stepY = 1;
                    } else if (heading == 3) {
                        stepX = -1; stepY = 0;
                    } else {
                        throw new Error();
                    }
                    final double centerX = (agent.pos.x + 0.5) * charSize;
                    final double centerY = (sizeY - agent.pos.y - 0.5) * charSize;
                    final int tipX = (int)(centerX + stepX * 0.7 * charSize);
                    final int tipY = (int)(centerY + stepY * 0.7 * charSize);
                    g.drawLine((int)centerX, (int)centerY, tipX, tipY);

                    // One character per cell.
                    for (int row = 0; row < sizeY; row++) {
                        final int baseline = (sizeY - row) * charSize;
                        for (int col = 0; col < sizeX; col++) {
                            final String glyph = Character.toString(map[col][row].code);
                            g.setColor(Color.BLACK);
                            g.drawString(glyph, col * charSize, baseline);
                        }
                    }

                    // Frames around the attended cells, converted to map coordinates.
                    final int frame = charSize - 2;
                    g.setColor(Color.GREEN);
                    final Position first = agent.o1pos.unRelativeTo(agent.pos, agent.headDirection);
                    g.drawRect(first.x * charSize, (sizeY - first.y - 1) * charSize,
                            frame, frame);
                    g.setColor(Color.BLUE);
                    final Position second = agent.o2pos.unRelativeTo(agent.pos, agent.headDirection);
                    g.drawRect(second.x * charSize + 2, (sizeY - second.y - 1) * charSize + 2,
                            frame, frame);
                }
            }
        
            /**
             * Executes one primitive action for agent {@code a} on the map and
             * returns the reward of that single step.
             *
             * Heading numbers and map axes:
             *   0     |y
             *  3@1    |
             *   2     O--->x
             *
             *  (x,y)=(0,0) is lower left corner
             *  (x,y)=(sizeX-1,sizeY-1) is upper right corner
             *
             * The agent's own map cell holds the agent marker while nothing is
             * loaded and the loaded item otherwise. The attention positions
             * a.o1pos / a.o2pos are kept relative to the agent and are updated
             * whenever the agent moves or turns.
             *
             * Every handled action costs -1, except Load on Nut/Meat (+10, the cell
             * becomes Leftovers) and Load on Leftovers (+1, the cell becomes Space).
             * Actions not handled here (Call, Return, Fail) hit Lab.assertTrue(false).
             *
             * All agent state is read from and written to the parameter {@code a}.
             *
             * @param action the primitive action to execute
             * @param a      the acting agent
             * @return the reward for this step
             */
            public float takePrimitiveAction(Action action, Agent a) {
                float reward = 0;
                int dx, dy;
                switch (a.headDirection){
                case 0: dx = 0; dy = 1; break; // Front
                case 1: dx = 1; dy = 0; break; // Right
                case 2: dx = 0; dy = -1; break; // Back
                case 3: dx = -1; dy = 0; break; // Left
                default: throw new Error();
                }
                Position f = new Position(dx, dy);
                // Cell directly in front of the agent.
                final int frontX = a.pos.x + dx;
                final int frontY = a.pos.y + dy;

                switch (action) {
                case MoveForward: {
                    switch (map[frontX][frontY]) {
                    case Wall: {
                        reward += -1;
                    } break;
                    case Space: {
                        map[frontX][frontY] = map[a.pos.x][a.pos.y];
                        map[a.pos.x][a.pos.y] = Item.Space;
                        // An attention position at (0,0) stays on the agent's own
                        // cell; any other attended cell shifts one step backward.
                        if (a.o1pos.x != 0 || a.o1pos.y != 0){
                            a.o1pos.moveBackward();
                        }
                        if (a.o2pos.x != 0 || a.o2pos.y != 0){
                            a.o2pos.moveBackward();
                        }
                        // Update the agent's position in map coordinates.
                        a.pos.moveForward(f);
                        reward += -1;
                    } break;
                    case Stone:
                    case Shell:
                    case Nut:
                    case Meat:
                    case Leftovers:
                    {
                        // Just bumps into the object; same as a wall.
                        reward += -1;
                    } break;

                    default:
                        Lab.assertTrue(false);
                        break;
                    }
                } break;

                case Load: {
                    switch (map[frontX][frontY]) {
                    case Wall:
                    case Space:{
                        reward += -1;
                    } break;
                    case Stone:
                    case Shell:
                    {
                        map[a.pos.x][a.pos.y] = map[frontX][frontY];
                        map[frontX][frontY] = Item.Space;
                        // Attention on the loaded object follows it onto the
                        // agent's own cell.
                        if (a.o1pos.x == 0 && a.o1pos.y == 1){
                            a.o1pos.moveBackward();
                        }
                        if (a.o2pos.x == 0 && a.o2pos.y == 1){
                            a.o2pos.moveBackward();
                        }
                        reward += -1;
                    } break;
                    case Nut:
                    case Meat: {
                        map[frontX][frontY] = Item.Leftovers;
                        reward += 10;
                    } break;
                    case Leftovers: {
                        map[frontX][frontY] = Item.Space;
                        reward += 1;
                    } break;

                    default:
                        Lab.assertTrue(false);
                        break;
                    }
                } break;

                case Unload: {
                    if (map[a.pos.x][a.pos.y] == Item.agentChar(a.headDirection)) {
                        // Do nothing if nothing is loaded.
                        reward += -1;
                    } else if (map[a.pos.x][a.pos.y] == Stone &&
                            map[frontX][frontY] == Shell) {
                        map[frontX][frontY] = Nut;
                        // Attend to the held object as o1 and to the object in
                        // front as o2.
                        a.o1pos = new Position(0,0);
                        a.o2pos = new Position(0,1);
                        reward += -1;
                    } else if (map[frontX][frontY] == Space){
                        map[frontX][frontY] = map[a.pos.x][a.pos.y];
                        map[a.pos.x][a.pos.y] = Item.agentChar(a.headDirection);
                        // Attention on the unloaded object follows it to the
                        // cell in front.
                        if (a.o1pos.x == 0 && a.o1pos.y == 0){
                            a.o1pos.moveForward();
                        }
                        if (a.o2pos.x == 0 && a.o2pos.y == 0){
                            a.o2pos.moveForward();
                        }
                        reward += -1;
                    } else {
                        // Do nothing.
                        reward += -1;
                    }
                } break;

                case TurnRight:{
                    Item oldSelf = Item.agentChar(a.headDirection);
                    a.headDirection = (a.headDirection + 1) % 4;
                    if (map[a.pos.x][a.pos.y] == oldSelf) {
                        map[a.pos.x][a.pos.y] = Item.agentChar(a.headDirection);
                    }
                    // Update the agent-relative coordinates of the attended
                    // objects: they rotate opposite to the agent's turn.
                    a.o1pos.rotateLeft();
                    a.o2pos.rotateLeft();
                    reward += -1;
                } break;

                case TurnLeft:{
                    Item oldSelf = Item.agentChar(a.headDirection);
                    a.headDirection = (a.headDirection + 3) % 4;
                    if (map[a.pos.x][a.pos.y] == oldSelf) {
                        map[a.pos.x][a.pos.y] = Item.agentChar(a.headDirection);
                    }
                    // Update the agent-relative coordinates of the attended
                    // objects: they rotate opposite to the agent's turn.
                    a.o1pos.rotateRight();
                    a.o2pos.rotateRight();
                    reward += -1;
                } break;

                case Reset1: {
                    a.o1pos = new Position(0,0);
                    reward += -1;
                } break;

                case Reset2: {
                    a.o2pos = new Position(0,0);
                    reward += -1;
                } break;

                case Find1: {
                    // newG = s(item,__,__,__,__,__)
                    Position pos = findItem(a.pos, a.newG.values[0]);
                    a.o1pos = pos.relativeTo(a.pos, a.headDirection);
                    reward += -1;
                } break;

                case Find2: {
                    // newG = s(__,__,__,item,__,__)
                    Position pos = findItem(a.pos, a.newG.values[3]);
                    a.o2pos = pos.relativeTo(a.pos, a.headDirection);
                    reward += -1;
                } break;

                case LookAround1: {
                    Position salientPos = findSalientPos(a.pos);
                    a.o1pos = salientPos.relativeTo(a.pos, a.headDirection);
                    reward += -1;
                } break;

                case LookAround2: {
                    Position salientPos = findSalientPos(a.pos);
                    a.o2pos = salientPos.relativeTo(a.pos, a.headDirection);
                    reward += -1;
                } break;

                default:
                    Lab.assertTrue(false);
                    break;
                }
                return reward;
            }
            /** Half-width of the square searched by findItem() and sampled by findSalientPos(); read from the panel. */
            public int lookAroundRadius = panel.getInt("lookAroundRadius", 10, 1, 100);

            /**
             * Searches the square of half-width lookAroundRadius around {@code pos},
             * clipped to the map, for a cell holding {@code target}.
             *
             * Cells are scanned column by column (x ascending, then y ascending) and
             * the first match is returned, which is not necessarily the nearest one.
             *
             * @param pos    centre of the search, in map coordinates
             * @param target the item to look for; anything that is not an Item never matches
             * @return the position of the first match, or a copy of {@code pos} if none
             */
            public Position findItem(Position pos, Object target){
                if (!(target instanceof Item)) {
                    return pos.clone(); // Not found.
                }
                final Item wanted = (Item) target;

                // Clip the search window to the map instead of skipping cells.
                final int xLow = Math.max(0, pos.x - lookAroundRadius);
                final int xHigh = Math.min(sizeX - 1, pos.x + lookAroundRadius);
                final int yLow = Math.max(0, pos.y - lookAroundRadius);
                final int yHigh = Math.min(sizeY - 1, pos.y + lookAroundRadius);

                for (int x = xLow; x <= xHigh; x++) {
                    for (int y = yLow; y <= yHigh; y++) {
                        if (map[x][y] == wanted) {
                            return new Position(x, y);
                        }
                    }
                }
                //agent.objectNotFound = true;
                return pos.clone(); // Not found.
            }
            /**
             * Picks a random non-empty cell near {@code pos}.
             *
             * Up to lookAroundRadius^2 cells are sampled with offsets drawn by
             * Lab.irand(lookAroundRadius * 2) - lookAroundRadius per axis. A sample
             * is rejected when it lies outside the map, on column x == 0 or row
             * y == 0, or on a Space cell.
             *
             * @param pos centre of the sampled area, in map coordinates
             * @return the first accepted sample, or a copy of {@code pos} when every
             *         sample was rejected, so the result is always inside the map
             */
            public Position findSalientPos(Position pos){
                final int attempts = lookAroundRadius * lookAroundRadius;
                for (int i = 0; i < attempts; i++) {
                    int x = pos.x + Lab.irand(lookAroundRadius * 2) - lookAroundRadius;
                    int y = pos.y + Lab.irand(lookAroundRadius * 2) - lookAroundRadius;
                    if (x < 0 || sizeX <= x || y < 0 || sizeY <= y) continue;
                    if (x == 0 || y == 0) continue;
                    if (map[x][y] == Item.Space) continue;
                    return new Position(x, y);
                }
                // Nothing salient found. Do not return the last rejected sample: it
                // may be outside the map and would later be used as a map index.
                return pos.clone();
            }
            /**
             * Returns what agent {@code a} currently observes of the world.
             *
             * The state has six values: the item on the cell attended as o1 followed
             * by o1's agent-relative x and y, then the same three values for o2.
             *
             * @param a the observing agent
             * @return the observed state
             */
            public State observe(Agent a){
                // Convert the agent-relative attention positions to map coordinates.
                final Position attended1 = a.o1pos.unRelativeTo(a.pos, a.headDirection);
                final Position attended2 = a.o2pos.unRelativeTo(a.pos, a.headDirection);

                // Items are reported as stored in the map (no direction adjustment).
                final Object seen1 = map[attended1.x][attended1.y];
                final Object seen2 = map[attended2.x][attended2.y];

                return new State(new Object[]{
                        seen1, a.o1pos.x, a.o1pos.y,
                        seen2, a.o2pos.x, a.o2pos.y});
            }
            /**
             * Returns Item.Agent for every input; {@code item} is currently ignored.
             *
             * NOTE(review): the disabled version below returned non-agent items
             * unchanged. Confirm whether ignoring the argument is intended before
             * calling this on arbitrary items.
             */
            public Item adjustItemDirection(Item item) {
                return Item.Agent;
//                switch  (item) {
//                case AgentN:
//                case AgentE:
//                case AgentW:
//                case AgentS:{
//                    // Seen from itself, the agent always faces forward.
//                    return Item.AgentN;
//                }
//                default:
//                    return item;
//                }
            }
        }
    }
}
