deeper1_model/tiny_chatbot.py at master · Deepersensor/deeper1_model · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env python3
"""
Tiny Conversational AI Model from Scratch
A minimal transformer-based chatbot with training
"""

import numpy as np
import pickle
import os

# Hyperparameters
# Default sizes for the character-level model defined below. Within this file
# only VOCAB_SIZE and MAX_LEN are read directly (by prepare_data); the other
# values are presumably passed to TinyChatbot / train by a caller -- TODO confirm.
VOCAB_SIZE = 128  # ASCII characters; prepare_data clamps larger code points to VOCAB_SIZE - 1
EMBED_DIM = 64  # presumably the embedding width (TinyChatbot's embed_dim) -- TODO confirm
NUM_HEADS = 4  # presumably the attention head count; 64 / 4 gives 16 features per head
FF_DIM = 128  # presumably the hidden width of each FeedForward layer -- TODO confirm
NUM_LAYERS = 2  # presumably the number of stacked TransformerBlock layers -- TODO confirm
MAX_LEN = 64  # prepare_data truncates every training text to this many characters
LEARNING_RATE = 0.001  # same value as train()'s default learning_rate; not referenced in this file

class MultiHeadAttention:
    """Multi-head scaled dot-product self-attention implemented with NumPy.

    Attributes:
        embed_dim: Width of the input and output feature vectors.
        num_heads: Number of attention heads.
        head_dim: Features per head, ``embed_dim // num_heads``.
        W_q, W_k, W_v, W_o: ``(embed_dim, embed_dim)`` projection matrices.
        cache: Intermediates of the most recent ``forward`` call, or ``None``
            before the first call.
    """

    def __init__(self, embed_dim, num_heads):
        """Create the projection matrices.

        Args:
            embed_dim: Feature width; must be a positive multiple of
                ``num_heads``.
            num_heads: Number of attention heads; must be positive.

        Raises:
            ValueError: If ``num_heads`` is not positive or does not divide
                ``embed_dim`` evenly.
        """
        # Fail here with a clear message instead of later inside forward(),
        # where the head split would surface as an opaque reshape error.
        if num_heads <= 0 or embed_dim <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be a positive multiple of "
                f"num_heads ({num_heads})"
            )

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Small random initial weights (query, key, value, output order).
        self.W_q = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_k = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_v = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_o = np.random.randn(embed_dim, embed_dim) * 0.01

        self.cache = None

    def forward(self, x, mask=None):
        """Apply self-attention to a batch of sequences.

        Args:
            x: Array of shape ``(batch, seq_len, embed_dim)``.
            mask: Optional additive mask that is added to the attention
                scores, whose shape is ``(batch, num_heads, seq_len, seq_len)``;
                it must broadcast against that shape. Large negative entries
                suppress the corresponding attention weights.

        Returns:
            Array of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.shape

        Q = self._split_heads(x @ self.W_q, batch_size, seq_len)
        K = self._split_heads(x @ self.W_k, batch_size, seq_len)
        V = self._split_heads(x @ self.W_v, batch_size, seq_len)

        # Scaled dot-product scores, shape (batch, heads, seq_len, seq_len).
        scores = Q @ K.transpose(0, 1, 3, 2) / np.sqrt(self.head_dim)

        if mask is not None:
            scores = scores + mask

        attn_weights = self.softmax(scores)
        attn_output = attn_weights @ V

        # Merge the heads back into one (batch, seq_len, embed_dim) array.
        attn_output = attn_output.transpose(0, 2, 1, 3).reshape(
            batch_size, seq_len, self.embed_dim
        )
        output = attn_output @ self.W_o

        # Q, K and V are stored in their per-head layout; attn_output is the
        # merged tensor before the output projection.
        self.cache = (x, Q, K, V, attn_weights, attn_output)
        return output

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape ``(batch, seq, embed)`` to ``(batch, heads, seq, head_dim)``."""
        return projected.reshape(
            batch_size, seq_len, self.num_heads, self.head_dim
        ).transpose(0, 2, 1, 3)

    def softmax(self, x):
        """Return the softmax of ``x`` along its last axis.

        The row maximum is subtracted before exponentiating so that large
        scores do not overflow.
        """
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

class FeedForward:
    """Two-layer perceptron applied to the last axis: linear, ReLU, linear."""

    def __init__(self, embed_dim, ff_dim):
        """Create small random weights and zero biases.

        Args:
            embed_dim: Width of the layer's input and output.
            ff_dim: Width of the hidden layer.
        """
        init_scale = 0.01
        # First layer: embed_dim -> ff_dim.
        self.W1 = np.random.randn(embed_dim, ff_dim) * init_scale
        self.b1 = np.zeros(ff_dim)
        # Second layer: ff_dim -> embed_dim.
        self.W2 = np.random.randn(ff_dim, embed_dim) * init_scale
        self.b2 = np.zeros(embed_dim)

    def forward(self, x):
        """Run the MLP on ``x`` and return an array with ``x``'s shape.

        The input and the post-ReLU activations are kept on the instance as
        ``self.x`` and ``self.h``.
        """
        self.x = x
        pre_activation = x @ self.W1 + self.b1
        self.h = np.maximum(0, pre_activation)  # ReLU
        projected = self.h @ self.W2
        return projected + self.b2

class TransformerBlock:
    """One transformer layer: attention and feed-forward sub-layers.

    Each sub-layer's output is added to its input and the sum is then
    layer-normalised.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        """Build both sub-layers and the two sets of layer-norm parameters."""
        # Sub-layers; attention is created first.
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.ff = FeedForward(embed_dim, ff_dim)

        # Layer-norm scale starts at one and shift at zero.
        self.ln1_gamma, self.ln1_beta = np.ones(embed_dim), np.zeros(embed_dim)
        self.ln2_gamma, self.ln2_beta = np.ones(embed_dim), np.zeros(embed_dim)

    def layer_norm(self, x, gamma, beta):
        """Normalise ``x`` over its last axis, then scale by ``gamma`` and shift by ``beta``."""
        centered = x - np.mean(x, axis=-1, keepdims=True)
        spread = np.sqrt(np.var(x, axis=-1, keepdims=True) + 1e-5)
        return gamma * centered / spread + beta

    def forward(self, x, mask=None):
        """Apply the attention and feed-forward sub-layers to ``x``.

        Args:
            x: Input activations, passed to the attention layer.
            mask: Optional attention mask forwarded to the attention layer.

        Returns:
            The layer output, same shape as ``x``.
        """
        # Attention sub-layer: residual add, then normalise.
        attended = self.attention.forward(x, mask)
        normed = self.layer_norm(x + attended, self.ln1_gamma, self.ln1_beta)

        # Feed-forward sub-layer: residual add, then normalise.
        transformed = self.ff.forward(normed)
        return self.layer_norm(normed + transformed, self.ln2_gamma, self.ln2_beta)

class TinyChatbot:
    """Character-level chatbot: canned replies first, tiny transformer second.

    ``generate`` answers known phrases from a fixed table and falls back to
    sampling from the transformer for everything else.

    Attributes:
        vocab_size: Number of distinct token ids (character codes).
        embed_dim: Width of the embeddings and hidden states.
        max_len: Longest sequence ``forward`` accepts.
        token_embed: ``(vocab_size, embed_dim)`` token embedding table.
        pos_embed: ``(max_len, embed_dim)`` position embedding table.
        blocks: List of ``TransformerBlock`` layers applied in order.
        W_out, b_out: Output projection from hidden states to logits.
    """

    # Canned replies keyed by lower-case trigger phrase.
    _RULE_RESPONSES = {
        "hello": "Hi there! Nice to meet you!",
        "hi": "Hey! How's it going?",
        "hey": "Hello! What's up?",
        "how are you": "I'm doing great, thanks!",
        "how are you doing": "Pretty good! How about you?",
        "what's up": "Not much, just here to chat!",
        "how's it going": "Excellent! How are you?",
        "thank you": "You're welcome!",
        "thanks": "Happy to help!",
        "thanks a lot": "Anytime!",
        "good morning": "Good morning! Have an awesome day!",
        "good afternoon": "Good afternoon! How's your day?",
        "good evening": "Good evening! Hope you're well!",
        "good night": "Sleep well! Good night!",
        "goodbye": "See you soon!",
        "bye": "Goodbye! Take care!",
        "who are you": "I'm an AI chatbot!",
        "what are you": "I'm an artificial intelligence!",
        "help": "I'm here to help! What do you need?",
        "what can you do": "I can chat and answer questions!",
        "tell me a joke": "Why did the AI go to school? To get smarter!",
        "are you smart": "I try my best!",
        "i like you": "I like you too!",
        "you're cool": "Thanks! You're cool too!",
        "you are great": "Thank you so much!",
    }

    # Longest triggers first, so "how are you doing" is tried before
    # "how are you" and "thanks a lot" before "thanks". The sort is stable,
    # so equal-length triggers keep their table order.
    _RULES_BY_LENGTH = tuple(
        sorted(_RULE_RESPONSES.items(), key=lambda item: len(item[0]), reverse=True)
    )

    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len):
        """Create randomly initialised embeddings, blocks and output layer.

        Args:
            vocab_size: Number of token ids.
            embed_dim: Width of embeddings and hidden states.
            num_heads: Attention heads per block.
            ff_dim: Hidden width of each block's feed-forward layer.
            num_layers: Number of transformer blocks.
            max_len: Number of positions in the position embedding table.
        """
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_len = max_len

        # Embeddings
        self.token_embed = np.random.randn(vocab_size, embed_dim) * 0.01
        self.pos_embed = np.random.randn(max_len, embed_dim) * 0.01

        # Transformer blocks
        self.blocks = [TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)]

        # Output layer
        self.W_out = np.random.randn(embed_dim, vocab_size) * 0.01
        self.b_out = np.zeros(vocab_size)

    def forward(self, x):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer array of shape ``(batch, seq_len)`` with
                ``seq_len <= max_len``.

        Returns:
            Array of shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        _, seq_len = x.shape
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )

        # Embeddings
        h = self.token_embed[x] + self.pos_embed[:seq_len]

        # Causal mask: -1e9 above the diagonal so no position attends to a
        # later one.
        mask = np.triu(np.ones((seq_len, seq_len)) * -1e9, k=1)

        for block in self.blocks:
            h = block.forward(h, mask)

        return h @ self.W_out + self.b_out

    def _canned_reply(self, prompt):
        """Return the canned reply whose trigger occurs in ``prompt``, else ``None``.

        Matching is case-insensitive and works on whole words, so "this" does
        not trigger "hi". When several triggers match, the longest one wins.
        """
        # Keep letters, digits and apostrophes (triggers such as "what's up"
        # contain them); everything else becomes a word separator.
        cleaned = ''.join(
            ch if ch.isalnum() or ch == "'" else ' ' for ch in prompt.lower()
        )
        words = (word.strip("'") for word in cleaned.split())
        padded = ' ' + ' '.join(word for word in words if word) + ' '

        for trigger, reply in self._RULES_BY_LENGTH:
            if f' {trigger} ' in padded:
                return reply
        return None

    def generate(self, prompt, max_new_tokens=50, temperature=0.7):
        """Reply to ``prompt``.

        A matching canned reply is returned as-is. Otherwise the prompt is
        encoded one character per token and extended by the model until
        ``max_new_tokens`` tokens were added or token 0 or 10 (newline) is
        produced.

        Args:
            prompt: User text.
            max_new_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature. Values ``<= 0`` select the
                most likely token at every step instead of sampling.

        Returns:
            The canned reply, or the (possibly truncated) prompt followed by
            the generated characters. An empty string is returned when there
            is no canned match and nothing to condition the model on.
        """
        canned = self._canned_reply(prompt)
        if canned is not None:
            return canned

        # Fallback: use neural model. The prompt is trimmed to leave room for
        # the generated tokens, but at least one character is always kept so
        # a large max_new_tokens cannot produce an empty or negative slice.
        prompt_budget = max(self.max_len - max_new_tokens, 1)
        tokens = [min(ord(c), self.vocab_size - 1) for c in prompt[:prompt_budget]]
        if not tokens:
            return ''

        for _ in range(max_new_tokens):
            window = np.array([tokens[-self.max_len:]])
            logits = self.forward(window)[0, -1, :]

            if temperature > 0:
                probs = self.softmax(logits / temperature)
                next_token = int(np.random.choice(self.vocab_size, p=probs))
            else:
                # Dividing by a zero temperature would yield NaN/inf; the
                # limiting behaviour is a greedy choice.
                next_token = int(np.argmax(logits))

            if next_token in (0, 10):
                break

            tokens.append(next_token)

        return ''.join(chr(t) if t < 128 else '?' for t in tokens)

    def softmax(self, x):
        """Return the softmax over all elements of ``x`` (max-shifted for stability)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / np.sum(exp_x)

    def save(self, path):
        """Pickle this model to ``path``."""
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(path):
        """Unpickle and return the object stored at ``path``.

        SECURITY: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

def prepare_data():
    """Build the token sequences used for training.

    Each (prompt, response) pair is joined with a single space, cut to at
    most MAX_LEN characters and encoded one character per token, with code
    points clamped to VOCAB_SIZE - 1.

    Returns:
        A list of token-id lists, one per pair, keeping only sequences with
        more than one token.
    """
    conversations = [
        # Greetings
        ("hello", "Hi there! How can I help?"),
        ("hi", "Hey! What can I do for you?"),
        ("hey", "Hello! Nice to meet you."),
        ("good morning", "Good morning! Have a great day!"),
        ("good afternoon", "Good afternoon! How are you?"),
        ("good evening", "Good evening! How's it going?"),

        # Asking how the bot is
        ("how are you", "I'm doing well, thank you!"),
        ("how are you doing", "I'm great, thanks for asking!"),
        ("what's up", "Not much! Just here to chat."),
        ("how's it going", "Pretty good! How about you?"),

        # Name and identity
        ("what is your name", "I'm a conversational AI!"),
        ("who are you", "I'm an AI chatbot here to help!"),
        ("what are you", "I'm an artificial intelligence."),
        ("are you human", "No, I'm an AI assistant!"),

        # Gratitude
        ("thank you", "You're welcome!"),
        ("thanks", "Happy to help!"),
        ("thanks a lot", "Anytime!"),
        ("much appreciated", "Always glad to assist!"),

        # Farewells
        ("bye", "Goodbye! Take care!"),
        ("goodbye", "See you soon!"),
        ("see you", "Catch you later!"),
        ("see you later", "Bye now!"),
        ("good night", "Sleep well! Good night!"),

        # Help requests
        ("help", "I'm here to help! What do you need?"),
        ("can you help", "Of course! What's the question?"),
        ("i need help", "I'm here for you! Tell me more."),
        ("what can you do", "I can answer questions and chat!"),

        # Feelings
        ("i'm happy", "That's wonderful! Keep smiling!"),
        ("i'm sad", "I'm sorry to hear that. Cheer up!"),
        ("i'm tired", "You should get some rest!"),
        ("i'm confused", "Let me try to explain it better!"),

        # Miscellaneous questions
        ("what time is it", "I don't track time, sorry!"),
        ("do you like me", "Of course I do!"),
        ("are you smart", "I try my best!"),
        ("can you help me", "I'll do my best!"),
        ("what should i do", "Think about your options!"),

        # Compliments
        ("you are great", "Thank you so much!"),
        ("you're amazing", "You're kind, thank you!"),
        ("i like you", "I like you too!"),
        ("you're cool", "Thanks! You're cool too!"),

        # Open-ended prompts
        ("tell me a joke", "Why did the AI go to school? To improve its learning!"),
        ("how old are you", "I'm brand new!"),
        ("where are you from", "I'm from the digital world!"),
        ("what do you think", "I think you're interesting!"),
        ("tell me something", "AI is everywhere nowadays!"),
    ]

    def encode(text):
        # Truncate first, then map each character to a clamped token id.
        return [min(ord(ch), VOCAB_SIZE - 1) for ch in text[:MAX_LEN]]

    # One sequence per pair: "<prompt> <response>".
    encoded = [encode(f"{prompt} {response}") for prompt, response in conversations]

    # A single token offers no next-token target, so such sequences are dropped.
    return [seq for seq in encoded if len(seq) > 1]

def _final_hidden_states(model, token_ids):
    """Return the activations that feed the model's output layer.

    Repeats the steps of ``TinyChatbot.forward`` up to, but not including,
    the output projection: token plus position embeddings, then every
    transformer block under a causal mask.
    """
    seq_len = token_ids.shape[1]
    hidden = model.token_embed[token_ids] + model.pos_embed[:seq_len]
    causal_mask = np.triu(np.ones((seq_len, seq_len)) * -1e9, k=1)
    for block in model.blocks:
        hidden = block.forward(hidden, causal_mask)
    return hidden


def train(model, data, epochs=100, batch_size=4, learning_rate=0.001):
    """Train the model's output layer with next-token cross-entropy.

    Only ``model.W_out`` and ``model.b_out`` are updated; embeddings and
    transformer blocks are left as they are. Progress is printed every 20
    epochs.

    Args:
        model: A ``TinyChatbot`` (needs ``token_embed``, ``pos_embed``,
            ``blocks``, ``W_out``, ``b_out`` and ``vocab_size``).
        data: List of token-id sequences. It is read in a random order each
            epoch but never reordered or modified.
        epochs: Number of passes over ``data``.
        batch_size: Sequences per update step.
        learning_rate: Step size applied to the summed batch gradient.
    """
    print(f"Training for {epochs} epochs...")

    n_seqs = len(data)

    for epoch in range(epochs):
        total_loss = 0.0
        total_predictions = 0

        # Visit the sequences in random order without shuffling the caller's list.
        order = np.random.permutation(n_seqs)

        for start in range(0, n_seqs, batch_size):
            batch = [data[i] for i in order[start:start + batch_size]]

            longest = max(len(seq) for seq in batch)
            if longest < 2:
                continue  # no (input, next-token) pair in this batch

            # Zero-pad to a rectangle and remember which target slots are real.
            X = np.zeros((len(batch), longest), dtype=int)
            is_real = np.zeros((len(batch), longest - 1), dtype=bool)
            for j, seq in enumerate(batch):
                X[j, :len(seq)] = seq
                is_real[j, :max(len(seq) - 1, 0)] = True

            inputs, targets = X[:, :-1], X[:, 1:]
            is_real &= (targets >= 0) & (targets < model.vocab_size)

            # Forward pass, keeping the hidden states needed for the gradient.
            hidden = _final_hidden_states(model, inputs)
            logits = hidden @ model.W_out + model.b_out

            # Row-wise softmax, shifted by the row maximum for stability.
            exp_logits = np.exp(logits - logits.max(axis=-1, keepdims=True))
            probs = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

            # Keep only real (non-padding) positions, flattened to rows.
            h_real = hidden[is_real]
            p_real = probs[is_real]
            t_real = targets[is_real]
            rows = np.arange(len(t_real))

            total_loss += -np.log(np.clip(p_real[rows, t_real], 1e-7, 1.0)).sum()
            total_predictions += len(t_real)

            # d(loss)/d(logits) = probs - one_hot(target).
            grad_logits = p_real.copy()
            grad_logits[rows, t_real] -= 1

            # The weight gradient is built from the real hidden activations.
            # A constant stand-in vector would be orthogonal to the zero-mean
            # layer-norm output and leave the logits unchanged.
            model.W_out -= learning_rate * (h_real.T @ grad_logits)
            model.b_out -= learning_rate * grad_logits.sum(axis=0)

        if (epoch + 1) % 20 == 0:
            avg_loss = total_loss / max(total_predictions, 1)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    print("Training complete!")