Current status: I would send you our latest conversation as a PDF,
along with the code I received as a .py file.
Compared with the past two years, my impression now is that with ChatGPT prompted properly, the necessary steps thought through, and the task built up in a structured way, the possibilities are practically limitless.
What I am getting at is that two years ago GPT made mistakes quite often even with a correct prompt, or the prompts themselves were not good enough
(perhaps I have improved in this respect since then), but these days, with minor corrections, we can "work together" very well.
2026.01.13.

*************************************
import random
from collections import defaultdict

# =====================================================
# GAME STATE (JĂTĂK ADATSTRUKTĂRA + LOGIKA)
# =====================================================

class GameState:
    def __init__(self):
        self.size = 6
        self.start_pos = (0, 0)
        self.goal_pos = (5, 5)

        # Static, solvable wall layout
        self.walls = {
            (0, 3),
            (1, 0), (1, 1), (1, 3), (1, 5),
            (2, 5),
            (3, 1), (3, 2), (3, 3),
            (4, 4),
            (5, 0), (5, 2),
        }

        self.reset()

    def reset(self):
        self.player_pos = self.start_pos
        self.last_pos = None
        self.steps = 0
        self.visits = defaultdict(int)
        self.visits[self.player_pos] += 1
        return self.player_pos

    def move(self, direction):
        x, y = self.player_pos
        dx, dy = direction
        nx, ny = x + dx, y + dy

        if not (0 <= nx < self.size and 0 <= ny < self.size):
            return False  # invalid
        if (nx, ny) in self.walls:
            return False  # wall

        self.last_pos = self.player_pos
        self.player_pos = (nx, ny)
        return True

    def is_goal(self):
        return self.player_pos == self.goal_pos


# =====================================================
# ENV ADAPTER (AI ↔ GAME INTERFACE)
# =====================================================

# Movement deltas as (d_row, d_col); row 0 is the top of the grid
ACTIONS = [
    (-1, 0),  # UP
    (1, 0),   # DOWN
    (0, -1),  # LEFT
    (0, 1),   # RIGHT
]

class LabyrinthEnv:
    def __init__(self):
        self.game = GameState()
        self.max_steps = self.game.size * self.game.size * 2

    def reset(self):
        return self.game.reset()

    def step(self, action_idx):
        action = ACTIONS[action_idx]
        reward = 0
        done = False

        # Progressive step penalty: grows as the episode goes on
        reward -= 1 + (self.game.steps / self.max_steps)

        prev_pos = self.game.last_pos  # cell the agent came from
        moved = self.game.move(action)
        if not moved:
            reward -= 5  # wall / out of bounds

        # Backtracking penalty: stepping straight back to the previous cell
        if moved and self.game.player_pos == prev_pos:
            reward -= 2

        self.game.steps += 1

        # Loop penalty: discourage revisiting the same cell repeatedly
        self.game.visits[self.game.player_pos] += 1
        if self.game.visits[self.game.player_pos] >= 3:
            reward -= min(10, 2 * self.game.visits[self.game.player_pos])

        # Goal reached
        if self.game.is_goal():
            reward += 100
            done = True

        # Step limit
        if self.game.steps >= self.max_steps:
            reward -= 20
            done = True

        return self.game.player_pos, reward, done


# =====================================================
# Q-LEARNING AGENT
# =====================================================

class QLearningAgent:
    def __init__(self):
        self.q = defaultdict(lambda: [0.0] * len(ACTIONS))
        self.alpha = 0.1
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.05

    def choose_action(self, state):
        if random.random() < self.epsilon:
            return random.randrange(len(ACTIONS))
        return max(range(len(ACTIONS)), key=lambda a: self.q[state][a])

    def update(self, state, action, reward, next_state):
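        # Tabular Q-learning (off-policy TD) update:
        #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))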
        best_next = max(self.q[next_state])
        td_target = reward + self.gamma * best_next
        self.q[state][action] += self.alpha * (td_target - self.q[state][action])

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


# =====================================================
# TRAINING / TEST RUN
# =====================================================

def train(episodes=2000):
    env = LabyrinthEnv()
    agent = QLearningAgent()

    wins = 0
    total_steps = 0

    for ep in range(1, episodes + 1):
        state = env.reset()
        done = False

        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state

        agent.decay_epsilon()
        total_steps += env.game.steps
        if env.game.is_goal():
            wins += 1

        if ep % 100 == 0:
            print(
                f"Episode {ep:4d} | "
                f"Win rate: {wins/ep:.2f} | "
                f"Avg steps: {total_steps/ep:.1f} | "
                f"Epsilon: {agent.epsilon:.2f}"
            )

    return agent
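

# =====================================================
# GREEDY ROLLOUT (illustrative sketch; not part of the
# original script)
# =====================================================

def run_greedy(agent):
    """Replay the learned policy with exploration turned off.

    A minimal inspection helper, assuming the agent was trained with
    train() above; prints the visited cells and whether the goal was hit.
    """
    env = LabyrinthEnv()
    state = env.reset()
    path = [state]
    done = False
    while not done:
        # Greedy action selection (epsilon = 0); env.step() still
        # enforces the step limit, so the loop always terminates
        action = max(range(len(ACTIONS)), key=lambda a: agent.q[state][a])
        state, _, done = env.step(action)
        path.append(state)
    print("Greedy path:", path)
    print("Reached goal:", env.game.is_goal())
    return path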


# =====================================================
# ENTRY POINT
# =====================================================

if __name__ == "__main__":
    trained_agent = train(episodes=2000)
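    # Optional: inspect the learned route with the greedy rollout sketch
    run_greedy(trained_agent)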
