diff --git a/seq2seq/attention_decoder.ipynb b/seq2seq/attention_decoder.ipynb index f4e92d2..dddbcb6 100644 --- a/seq2seq/attention_decoder.ipynb +++ b/seq2seq/attention_decoder.ipynb @@ -30,7 +30,39 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 2814k 100 2814k 0 0 507k 0 0:00:05 0:00:05 --:--:-- 631k\n", + "Archive: data.zip\n", + " creating: data/\n", + " inflating: data/eng-fra.txt \n", + " creating: data/names/\n", + " inflating: data/names/Arabic.txt \n", + " inflating: data/names/Chinese.txt \n", + " inflating: data/names/Czech.txt \n", + " inflating: data/names/Dutch.txt \n", + " inflating: data/names/English.txt \n", + " inflating: data/names/French.txt \n", + " inflating: data/names/German.txt \n", + " inflating: data/names/Greek.txt \n", + " inflating: data/names/Irish.txt \n", + " inflating: data/names/Italian.txt \n", + " inflating: data/names/Japanese.txt \n", + " inflating: data/names/Korean.txt \n", + " inflating: data/names/Polish.txt \n", + " inflating: data/names/Portuguese.txt \n", + " inflating: data/names/Russian.txt \n", + " inflating: data/names/Scottish.txt \n", + " inflating: data/names/Spanish.txt \n", + " inflating: data/names/Vietnamese.txt \n" + ] + } + ], "source": [ "# download the needed data\n", "if not os.path.isfile('data.zip'):\n", @@ -39,24 +71,68 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " de question !\n", + "Really?\tVraiment ?\n", + "Really?\tVrai ?\n", + "Really?\tAh bon ?\n", + "Thanks.\tMerci !\n", + "We try.\tOn essaye.\n", + "We won.\tNous avons gagné.\n", + "We won.\tNous gagnâmes.\n", + "We won.\tNous l'avons emporté.\n", + "We won.\tNous l'empor\n" + ] + } + ], + "source": [ + "# Take a quick view of the data.\n", + "with open('data/eng-fra.txt') as f:\n", + " f.seek(1000)\n", + " print(f.read(200))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "\n", "class Lang:\n", + " \"\"\"\n", + " Utility class that serves as a language dictionary\n", + " \"\"\"\n", " def __init__(self, name):\n", " self.name = name\n", + " # Count how often a word occurs in the language data.\n", " self.word2count = {}\n", + " # Words are mapped to indices and vice versa\n", " self.index2word = {0: \"SOS\", 1: \"EOS\"}\n", " self.word2index = {v:k for k, v in self.index2word.items()}\n", + " # Total word count\n", " self.n_words = 2 # Count SOS and EOS\n", "\n", " def add_sentence(self, sentence):\n", + " \"\"\"\n", + " Process words in a sentence string.\n", + " \n", + " :param sentence: (str) \n", + " \"\"\"\n", " for word in sentence.split(' '):\n", " self.add_word(word)\n", "\n", " def add_word(self, word):\n", + " \"\"\"\n", + " Process words\n", + " :param word: (str)\n", + " \"\"\"\n", " if word not in self.word2index:\n", " self.word2index[word] = self.n_words\n", " self.word2count[word] = 1\n", @@ -66,6 +142,9 @@ " self.word2count[word] += 1\n", " \n", " def translate_indexes(self, idx):\n", + " \"\"\"\n", + " Takes in a vector of indices and returns the sentence.\n", + " \"\"\"\n", " return [self.index2word[i] for i in idx]\n", " \n", "# Turn a Unicode string to plain ASCII, thanks to\n", @@ -106,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -137,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -165,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -187,18 +266,23 @@ "array(['we are even EOS', 'nous sommes a egalite EOS'], dtype='