From a8728921852a13633ad9167e12182867bc18f15b Mon Sep 17 00:00:00 2001 From: Jeremy Howard Date: Thu, 5 Mar 2020 13:57:14 -0800 Subject: [PATCH] rename --- .gitignore | 3 + 11_midlevel_data.ipynb | 1267 +++++++++ 11_nlp_dive.ipynb | 1315 --------- 12_better_rnn.ipynb | 1154 -------- 12_nlp_dive.ipynb | 2350 +++++++++++++++++ 14_deep_conv.ipynb | 1044 -------- 15_resnet.ipynb => 14_resnet.ipynb | 68 +- ...rch_details.ipynb => 15_arch_details.ipynb | 50 +- 17_accel_sgd.ipynb => 16_accel_sgd.ipynb | 376 ++- 19_foundations.ipynb => 17_foundations.ipynb | 58 +- 20_CAM.ipynb => 18_CAM.ipynb | 20 +- 18_callbacks.ipynb | 419 --- 21_learner.ipynb => 19_learner.ipynb | 214 +- 22_conclusion.ipynb => 20_conclusion.ipynb | 14 +- 14 files changed, 4220 insertions(+), 4132 deletions(-) create mode 100644 11_midlevel_data.ipynb delete mode 100644 11_nlp_dive.ipynb delete mode 100644 12_better_rnn.ipynb create mode 100644 12_nlp_dive.ipynb delete mode 100644 14_deep_conv.ipynb rename 15_resnet.ipynb => 14_resnet.ipynb (98%) rename 16_arch_details.ipynb => 15_arch_details.ipynb (89%) rename 17_accel_sgd.ipynb => 16_accel_sgd.ipynb (86%) rename 19_foundations.ipynb => 17_foundations.ipynb (96%) rename 20_CAM.ipynb => 18_CAM.ipynb (99%) delete mode 100644 18_callbacks.ipynb rename 21_learner.ipynb => 19_learner.ipynb (98%) rename 22_conclusion.ipynb => 20_conclusion.ipynb (95%) diff --git a/.gitignore b/.gitignore index 87620ac..794a30f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ +__pycache__/ +.last_checked +.gitconfig .ipynb_checkpoints/ diff --git a/11_midlevel_data.ipynb b/11_midlevel_data.ipynb new file mode 100644 index 0000000..40f0f67 --- /dev/null +++ b/11_midlevel_data.ipynb @@ -0,0 +1,1267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#hide\n", + "from utils import *\n", + "from IPython.display import display,HTML" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + 
"[[chapter_midlevel_data]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data munging with fastai's mid-level API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have seen what a `Tokenizer` or a `Numericalize` does to a collection of texts, and how they're used inside the data block API, which handles those transforms for us directly using the `TextBlock`. But what if we want to only apply one of those transforms, either to see intermediate results or because we have already tokenized texts? More generally, what can we do when the data block API is not flexible enough to accommodate our particular use case? For this, we need to use fastai's *mid-level API* for processing data. The data block API is built on top of that layer, so it will allow you to do everything the data block API does, and much much more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Going deeper into fastai's layered API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The fastai library is built on a *layered API*. At the very top layer, there are *applications* that allow us to train a model in five lines of code, as we saw in <>. In the case of creating `DataLoaders` for a text classifier, for instance, we used the line:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai2.text.all import *\n", + "\n", + "dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The factory method `TextDataLoaders.from_folder` is very convenient when your data is arranged the exact same way as the IMDb dataset, but in practice, that often won't be the case. The data block API offers more flexibility. 
As we saw in the last chapter, we can get the same result with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path = untar_data(URLs.IMDB)\n", + "dls = DataBlock(\n", + " blocks=(TextBlock.from_folder(path),CategoryBlock),\n", + " get_y = parent_label,\n", + " get_items=partial(get_text_files, folders=['train', 'test']),\n", + " splitter=GrandparentSplitter(valid_name='test')\n", + ").dataloaders(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But it's sometimes not flexible enough. For debugging purposes for instance, we might need to apply just parts of the transforms that come with this data block. Or, we might want to create `DataLoaders` for some application that isn't directly supported by fastai. In this section, we'll dig into the pieces that are used inside fastai to implement the data block API. By understanding these pieces, you'll be able to leverage the power and flexibility of this mid-tier API." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> note: The mid-level API in general does not only contain functionality for creating `DataLoaders`. It also has the *callback* system, which allows us to customize the training loop any way we like, and the *general optimizer*. Both will be covered in <>." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transforms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When we studied tokenization and numericalization in the last chapter, we started by grabbing a bunch of texts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = get_text_files(path, folders = ['train', 'test'])\n", + "txts = L(o.open().read() for o in files[:2000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then showed how to tokenize them with a `Tokenizer`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#228) ['xxbos','xxmaj','this','movie',',','which','i','just','discovered','at'...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok = Tokenizer.from_folder(path)\n", + "tok.setup(txts)\n", + "toks = txts.map(tok)\n", + "toks[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 8, 20, 27, 11, 88, 18, 53, 3286, 45])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num = Numericalize()\n", + "num.setup(toks)\n", + "nums = toks.map(num)\n", + "nums[0][:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And how to numericalize, including automatically creating the vocab for our corpus:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 8, 20, 27, 11, 88, 18, 53, 3286, 45])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num = Numericalize()\n", + "num.setup(toks)\n", + "nums = toks.map(num)\n", + 
"nums[0][:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classes also have a *decode* method. For instance, `Numericalize.decode` gives us back the string tokens:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#10) ['xxbos','xxmaj','this','movie',',','which','i','just','discovered','at']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nums_dec = num.decode(nums[0][:10]); nums_dec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "...and `Tokenizer.decode` turns this back into a single string (it may not, however, be exactly the same as the original string; this depends on whether the tokenizer is *reversible*, which the default word tokenizer is not at the time we're writing this book):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj this movie , which i just discovered at'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok.decode(nums_dec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`decode` is used by fastai's `show_batch` and `show_results`, as well as some other inference methods, to convert predictions and mini-batches into a human-understandable representation.\n", + "\n", + "For each of `tok` or `num` above, we created an object, called the setup method (which trains the tokenizer if needed for `tok` and creates the vocab for `num`), applied it to our raw texts (by calling the object as a function), and then finally decoded it back to an understandable representation. These steps are needed for most data preprocessing tasks, so fastai provides a class that encapsulates them. This is the `Transform` class. 
Both `Tokenizer` and `Numericalize` are `Transform`s.\n", + "\n", + "In general, a `Transform` is an object that behaves like a function, has an optional *setup* that will initialize some inner state (like the vocab inside `num` for instance), and has an optional *decode* that will reverse the function (this reversal may not be perfect, as we saw above for `tok`).\n", + "\n", + "A good example of `decode` is found in the `Normalize` transform that we saw in <>: to be able to plot the images its `decode` method undoes the normalization (i.e. it multiplies by the std and adds back the mean). On the other hand, data augmentation transforms do not have a `decode` method, since we want to show the effects on images, to make sure the data augmentation is working as we want.\n", + "\n", + "The second special behavior of `Transform`s is that they always get applied over tuples: in general, our data is always a tuple `(input,target)` (sometimes with more than one input or more than one target). When applying a transform on an item like this, such as `Resize`, we don't want to resize the tuple, but resize the input (if applicable) and the target (if applicable). 
It's the same for the batch transforms that do data augmentation: when the input is an image and the target is a segmentation mask, the transform needs to be applied (the same way) to the input and the target.\n", + "\n", + "We can see this behavior if we pass a tuple of texts to `tok`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((#374) ['xxbos','xxmaj','well',',','\"','cube','\"','(','1997',')'...],\n", + " (#207) ['xxbos','xxmaj','conrad','xxmaj','hall','went','out','with','a','bang'...])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tok((txts[0], txts[1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing your own Transform" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to write a custom transform to apply to your data, the easiest way is to write a function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def f(x): return x+1\n", + "tfm = Transform(f)\n", + "tfm(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`tfm` will automatically convert `f` to a `Transform` with no setup and no decode method. If you need either of those, you will need to subclass `Transform`. 
When writing this subclass, you need to implement the actual function in `encodes`, then (optionally), the setup behavior in `setups` and the decoding behavior in `decodes`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class NormalizeMean(Transform):\n", + " def setups(self, items): self.mean = sum(items)/len(items)\n", + " def encodes(self, x): return x-self.mean\n", + " def decodes(self, x): return x+self.mean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here `NormalizeMean` will initialize some state during the setup (the mean of all elements passed), then the transformation is to subtract that mean. For decoding purposes, we implement the reverse of that transformation by adding the mean. Here is an example of `NormalizeMean` in action:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3.0, 5.0, 2.0)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfm = NormalizeMean()\n", + "tfm.setup([1,2,3,4,5])\n", + "start = 2\n", + "y = tfm(start)\n", + "z = tfm.decode(y)\n", + "tfm.mean,y,z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the method called and the method implemented are different, for each of these methods:\n", + "\n", + "```asciidoc\n", + "[options=\"header\"]\n", + "|======\n", + "| Class | To call | To implement\n", + "| `nn.Module` (PyTorch) | `()` (i.e. call as function) | `forward`\n", + "| `Transform` | `()` | `encodes`\n", + "| `Transform` | `decode()` | `decodes`\n", + "| `Transform` | `setup()` | `setups`\n", + "|======\n", + "```\n", + "\n", + "So, for instance, you would never call `setups` directly, but instead would call `setup`. The reason for this is that `setup` does some work before and after calling `setups` for you. 
To learn more about `Transform`s and how you can use them to have different behavior depending on the type of the input, be sure to check the tutorials in the fastai docs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To compose several transforms together, fastai provides `Pipeline`. We define a `Pipeline` by passing it a list of `Transform`s; it will then compose the transforms inside it. When you call a `Pipeline` on an object, it will automatically call the transforms inside, in order:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 8, 76, 10, 23, 3112, 23, 34, 3113, 33, 10, 8, 4477, 22, 88, 32, 10, 27, 42, 14])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfms = Pipeline([tok, num])\n", + "t = tfms(txts[0]); t[:20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you can call decode on the result of your encoding, to get back something you can display and analyze:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj well , \" cube \" ( 1997 ) , xxmaj vincenzo \\'s first movie , was one of the most interesti'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfms.decode(t)[:100]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The only part that doesn't work the same way as in `Transform` is the setup. To properly setup a `Pipeline` of `Transform`s on some data, you need to use a `TfmdLists`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TfmdLists and Datasets: Transformed collections" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Your data is usually a set of raw items (like filenames, or rows in a dataframe) to which you want to apply a succession of transformations. We just saw that the succession of transformations was represented by a `Pipeline` in fastai. The class that groups together this pipeline with your raw items is called `TfmdLists`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TfmdLists" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the short way of doing the transformation we saw in the previous section:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tls = TfmdLists(files, [Tokenizer.from_folder(path), Numericalize])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At initialization, the `TfmdLists` will automatically call the setup method of each transform in order, providing them not with the raw items but the items transformed by all the previous `Transform`s in order. 
We can get the result of our pipeline on any raw element just by indexing into the `TfmdLists`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 8, 91, 11, 22, 5793, 22, 37, 4910, 34, 11, 8, 13042, 23, 107, 30, 11, 25, 44, 14])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = tls[0]; t[:20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the `TfmdLists` knows how to decode for showing purposes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj well , \" cube \" ( 1997 ) , xxmaj vincenzo \\'s first movie , was one of the most interesti'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tls.decode(t)[:100]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In fact, it even has a `show` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xxbos xxmaj well , \" cube \" ( 1997 ) , xxmaj vincenzo 's first movie , was one of the most interesting and tricky ideas that xxmaj i 've ever seen when talking about movies . xxmaj they had just one scenery , a bunch of actors and a plot . xxmaj so , what made it so special were all the effective direction , great dialogs and a bizarre condition that characters had to deal like rats in a labyrinth . xxmaj his second movie , \" cypher \" ( 2002 ) , was all about its story , but it was n't so good as \" cube \" but here are the characters being tested like rats again . 
\n", + "\n", + " \" nothing \" is something very interesting and gets xxmaj vincenzo coming back to his ' cube days ' , locking the characters once again in a very different space with no time once more playing with the characters like playing with rats in an experience room . xxmaj but instead of a thriller sci - fi ( even some of the promotional teasers and trailers erroneous seemed like that ) , \" nothing \" is a loose and light comedy that for sure can be called a modern satire about our society and also about the intolerant world we 're living . xxmaj once again xxmaj xxunk amaze us with a great idea into a so small kind of thing . 2 actors and a blinding white scenario , that 's all you got most part of time and you do n't need more than that . xxmaj while \" cube \" is a claustrophobic experience and \" cypher \" confusing , \" nothing \" is completely the opposite but at the same time also desperate . \n", + "\n", + " xxmaj this movie proves once again that a smart idea means much more than just a millionaire budget . xxmaj of course that the movie fails sometimes , but its prime idea means a lot and offsets any flaws . xxmaj there 's nothing more to be said about this movie because everything is a brilliant surprise and a totally different experience that i had in movies since \" cube \" .\n" + ] + } + ], + "source": [ + "tls.show(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `TfmdLists` is named with an \"s\" because it can handle a training and validation set with a splits argument. 
You just need to pass the indices of which elements are in the training set, and which are in the validation set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cut = int(len(files)*0.8)\n", + "splits = [list(range(cut)), list(range(cut,len(files)))]\n", + "tls = TfmdLists(files, [Tokenizer.from_folder(path), Numericalize], splits=splits)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can then access them through the `train` and `valid` attributes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 8, 20, 30, 87, 510, 1570, 12, 408, 379, 4196, 10, 8, 20, 30, 16, 13, 12216, 202, 509])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tls.valid[0][:20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you have manually written a `Transform` that returns your whole data (input and target) from the raw items you had, then `TfmdLists` is the class you need. You can directly convert it to a `DataLoaders` object with the `dataloaders` method. This is what we will do in our Siamese example further in this chapter.\n", + "\n", + "In general though, you have two (or more) parallel pipelines of transforms: one for processing your raw items into inputs and one to process your raw items into targets. For instance, here, the pipeline we defined only processes the input. If we want to do text classification, we have to process the labels as well. \n", + "\n", + "Here we need to do two things: first take the label name from the parent folder. 
There is a function `parent_label` for this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#50000) ['pos','pos','pos','pos','pos','pos','pos','pos','pos','pos'...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lbls = files.map(parent_label)\n", + "lbls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we need a `Transform` that will grab the unique items and build a vocab with it during setup, then will transform the string labels into integers when called. fastai provides this transform, it's called `Categorize`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((#2) ['neg','pos'], TensorCategory(1))" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat = Categorize()\n", + "cat.setup(lbls)\n", + "cat.vocab, cat(lbls[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To do the whole setup automatically on our list of files, we can create a `TfmdLists` as before:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TensorCategory(1)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tls_y = TfmdLists(files, [parent_label, Categorize()])\n", + "tls_y[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But then we end up with two separate objects for our inputs and targets, which is not what we want. This is where `Datasets` comes to the rescue." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Datasets` will apply two (or more) pipelines in parallel to the same raw object and build a tuple with the result. Like `TfmdLists`, it will automatically do the setup for us, and when we index into a `Datasets`, it will return us a tuple with the results of each pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_tfms = [Tokenizer.from_folder(path), Numericalize]\n", + "y_tfms = [parent_label, Categorize()]\n", + "dsets = Datasets(files, [x_tfms, y_tfms])\n", + "x,y = dsets[0]\n", + "x[:20],y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like a `TfmdLists`, we can pass along `splits` to a `Datasets` to split our data between training and validation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([ 2, 8, 20, 30, 87, 510, 1570, 12, 408, 379, 4196, 10, 8, 20, 30, 16, 13, 12216, 202, 509]),\n", + " TensorCategory(0))" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_tfms = [Tokenizer.from_folder(path), Numericalize]\n", + "y_tfms = [parent_label, Categorize()]\n", + "dsets = Datasets(files, [x_tfms, y_tfms], splits=splits)\n", + "x,y = dsets.valid[0]\n", + "x[:20],y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It can also decode any processed tuple or show it directly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('xxbos xxmaj this movie had horrible lighting and terrible camera movements . xxmaj this movie is a jumpy horror flick with no meaning at all . xxmaj the slashes are totally fake looking . 
xxmaj it looks like some 17 year - old idiot wrote this movie and a 10 year old kid shot it . xxmaj with the worst acting you can ever find . xxmaj people are tired of knives . xxmaj at least move on to guns or fire . xxmaj it has almost exact lines from \" when a xxmaj stranger xxmaj calls \" . xxmaj with gruesome killings , only crazy people would enjoy this movie . xxmaj it is obvious the writer does n\\'t have kids or even care for them . i mean at show some mercy . xxmaj just to sum it up , this movie is a \" b \" movie and it sucked . xxmaj just for your own sake , do n\\'t even think about wasting your time watching this crappy movie .',\n", + " 'neg')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = dsets.valid[0]\n", + "dsets.decode(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last step is to convert your `Datasets` object to a `DataLoaders`, which can be done with the `dataloaders` method. Here we need to pass along special arguments to take care of the padding problem (as we saw in the last chapter). This needs to happen just before we batch the elements, so we pass it to `before_batch`: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dls = dsets.dataloaders(bs=64, before_batch=pad_input)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`dataloaders` directly calls `DataLoader` on each subset of our `Datasets`. fastai's `DataLoader` expands the PyTorch class of the same name and is responsible for collating the items from our datasets into batches. It has a lot of points of customization but the most important you should know are:\n", + "\n", + "- `after_item`: applied on each item after grabbing it inside the dataset. This is the equivalent of the `item_tfms` in `DataBlock`.\n", + "- `before_batch`: applied on the list of items before they are collated. 
This is the ideal place to pad items to the same size.\n", + "- `after_batch`: applied on the batch as a whole after its construction. This is the equivalent of the `batch_tfms` in `DataBlock`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a conclusion, here is the full code necessary to prepare the data for text classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tfms = [[Tokenizer.from_folder(path), Numericalize], [parent_label, Categorize]]\n", + "files = get_text_files(path, folders = ['train', 'test'])\n", + "splits = GrandparentSplitter(valid_name='test')(files)\n", + "dsets = Datasets(files, tfms, splits=splits)\n", + "dls = dsets.dataloaders(dl_type=SortedDL, before_batch=pad_input)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The two differences with what we had above are the use of `GrandparentSplitter` to split our training and validation data, and the `dl_type` argument. This is to tell `dataloaders` to use the `SortedDL` class of `DataLoader`, and not the usual one. 
This is the class that will handle the construction of batches by putting samples of roughly the same lengths into batches.\n", + "\n", + "This does the exact same thing as our `DataBlock` from above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path = untar_data(URLs.IMDB)\n", + "dls = DataBlock(\n", + " blocks=(TextBlock.from_folder(path),CategoryBlock),\n", + " get_y = parent_label,\n", + " get_items=partial(get_text_files, folders=['train', 'test']),\n", + " splitter=GrandparentSplitter(valid_name='test')\n", + ").dataloaders(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "...except that now, you know how to customize every single piece of it!\n", + "\n", + "Let's practice what we just learned on this mid-level API for data preprocessing on a computer vision example now, with a Siamese Model input pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Applying the mid-tier data API: SiamesePair" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Siamese model takes two images and has to determine if they are of the same class or not. For this example, we will use the pets dataset again, and prepare the data for a model that will have to predict if two images of pets are of the same breed or not. We will explain here how to prepare the data for such a model, then we will train that model in <>." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First things first, let's get all the images in our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai2.vision.all import *\n", + "path = untar_data(URLs.PETS)\n", + "files = get_image_files(path/\"images\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we didn't care about showing our objects at all, we could directly create one transform to completely preprocess that list of files. We will want to look at those images though, so we need to create a custom type. When you call the `show` method on a `TfmdLists` or a `Datasets` object, it will decode items until it reaches a type that contains a `show` method and use it to show the object. That `show` method gets passed a `ctx`, which could be a matplotlib axes for images, or the row of a dataframe for texts.\n", + "\n", + "Here we create a `SiameseImage` object that subclasses tuples and is intended to contain three things: two images and a boolean to know if they are the same or not. We implement the `show` method that concatenates the two images with a black line in the middle. You can skip the part that is in the if test (which is to show the `SiameseImage` when the images are pillow images and not tensors), the important part is in the last three lines." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SiameseImage(Tuple):\n", + " def show(self, ctx=None, **kwargs): \n", + " img1,img2,same_breed = self\n", + " if not isinstance(img1, Tensor):\n", + " if img2.size != img1.size: img2 = img2.resize(img1.size)\n", + " t1,t2 = tensor(img1),tensor(img2)\n", + " t1,t2 = t1.permute(2,0,1),t2.permute(2,0,1)\n", + " else: t1,t2 = img1,img2\n", + " line = t1.new_zeros(t1.shape[0], t1.shape[1], 10)\n", + " return show_image(torch.cat([t1,line,t2], dim=2), \n", + " title=same_breed, ctx=ctx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's then create a first `SiameseImage` and check our `show` method works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "img = PILImage.create(files[0])\n", + "s = SiameseImage(img, img, True)\n", + "s.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also try with a second image that's not from the same class:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "img1 = PILImage.create(files[1])\n", + "s1 = SiameseImage(img, img1, False)\n", + "s1.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The important thing with `Transform`s we saw before is that they dispatch over tuples or their subclasses. That's precisely why we chose to subclass tuple in this instance: this way we can apply any transform that work on images to our `SiameseImage` and it will be applied on each image in the tuple:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "s2 = Resize(224)(s1)\n", + "s2.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here the resize transform is applied to each of the two images, but not the boolean flag. Even if we have a custom type, we can thus benefit form all the data augmentation transforms inside the library.\n", + "\n", + "We are now ready to build the `Transform` that we will use to get our data ready for a Siamese model. First, we will need a function to determine the class of all our images:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def label_func(fname):\n", + " return re.match(r'^(.*)_\\d+.jpg$', fname.name).groups()[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then here is our main transform. For each image, il will, with a probability of 0.5, draw an image from the same class and return a `SiameseImage` with a true label, or draw an image from another class and a return a `SiameseImage` with a false label. This is all done in the private `_draw` function. There is one difference between the training and validation set, which is why the transform needs to be initialized with the splits: on the training set, we will make that random pick each time we read an image, whereas on the validation set, we make this random pick once and for all at initialization. This way, we get more varied samples during training, but always the same validation set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SiameseTransform(Transform):\n", + " def __init__(self, files, label_func, splits):\n", + " self.labels = files.map(label_func).unique()\n", + " self.lbl2files = {l: L(f for f in files if label_func(f) == l) for l in self.labels}\n", + " self.label_func = label_func\n", + " self.valid = {f: self._draw(f) for f in files[splits[1]]}\n", + " \n", + " def encodes(self, f):\n", + " f2,t = self.valid.get(f, self._draw(f))\n", + " img1,img2 = PILImage.create(f),PILImage.create(f2)\n", + " return SiameseImage(img1, img2, t)\n", + " \n", + " def _draw(self, f):\n", + " same = random.random() < 0.5\n", + " cls = self.label_func(f)\n", + " if not same: cls = random.choice(L(l for l in self.labels if l != cls)) \n", + " return random.choice(self.lbl2files[cls]),same" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then create our main transform:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "splits = RandomSplitter()(files)\n", + "tfm = SiameseTransform(files, label_func, splits)\n", + "tfm(files[0]).show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the middle level API for data collection, we have two objects that can help us apply transforms on a set of items, `TfmdLists` and `Datasets`. If you remember what we have just seen, one applies a `Pipeline` of transforms and the other applies several `Pipeline` of transforms in parallel, to build tuples. Here, our main transform already builds the tuples, so we use `TfmdLists`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "tls = TfmdLists(files, tfm, splits=splits)\n", + "show_at(tls.valid, 0);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can finally get our data in `DataLoaders` by calling the `dataloaders` method. One thing to be careful here is that this method does not take `item_tfms` and `batch_tfms` like a `DataBlock`. The fastai `DataLoader` has several hooks that are named after events: here what we apply on the items after they are grabbed is called `after_item`, and what we apply on the batch once it's buils is called `after_batch`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dls = tls.dataloaders(after_item=[Resize(224), ToTensor], \n", + " after_batch=[IntToFloatTensor, Normalize.from_stats(*imagenet_stats)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we need to pass more transforms than usual: that's because the data block API usually adds them automatically:\n", + "\n", + "- `ToTensor` is the one that converts images to tensors (again, it's applied on every part of the tuple)\n", + "- `IntToFloatTensor` convert the tensor of images that have integers from 0 to 255 to a tensor of floats, and divides by 255 to make the values between 0 and 1." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we have can now train a model using those `DataLoaders`. It needs a bit more customization than the usual model provided by `cnn_learner` since it has to take two images instead of one. We will see how to create such a model and train it in <>." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TK conclusion and questionnaire" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Becoming a deep learning practitioner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations — you've completed all of the chapters in this book which cover the key practical parts of training and using deep learning! You know how to use all of fastai's built-in applications, and how to customise them using the data block API and loss functions. You even know how to create a neural network from scratch, and train it! (And hopefully you now know some of the questions to ask to help make sure your creations help improve society too.)\n", + "\n", + "The knowledge you already have is enough to create full working prototypes of many types of neural network application. More importantly, it will help you understand the capabilities and limitations of deep learning models, and how to design a system which best handles these capabilities and limitations.\n", + "\n", + "In the rest of this book we will be pulling apart these applications, piece by piece, to understand all of the foundations they are built on. This is important knowledge for a deep learning practitioner, because it is the knowledge which allows you to inspect and debug models that you build, and to create new applications which are customised for your particular projects."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "split_at_heading": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/11_nlp_dive.ipynb b/11_nlp_dive.ipynb deleted file mode 100644 index 030c726..0000000 --- a/11_nlp_dive.ipynb +++ /dev/null @@ -1,1315 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "from utils import *" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "[[chapter_nlp_dive]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A language model from scratch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're now ready to go deep... deep into deep learning! You already learned how to train a basic neural network, but how do you go from there to creating state of the art models? In this part of the book we're going to uncover all of the mysteries, starting with language models." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## The data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Whenever we start working on a new problem, we always first try to think of the simplest dataset we can which would allow us to try out methods quickly and easily, and interpret the results. When we started working on language modelling a few years ago, we didn't find any datasets that would allow for quick prototyping, so we made one. We call it *human numbers*, and it simply contains the first 10,000 numbers written out in English." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> j: One of the most common practical mistakes I see even amongst highly experienced practitioners is failing to use appropriate datasets at appropriate times during the analysis process. In particular, most people tend to start with datasets which are too big and too complicated." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can download, extract, and take a look at our dataset in the usual way:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastai2.text.all import *\n", - "path = untar_data(URLs.HUMAN_NUMBERS)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "Path.BASE_PATH = path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#2) [Path('train.txt'),Path('valid.txt')]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path.ls()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's open those two files and see what's inside. 
At first we'll join all of those texts together and ignore the split train/valid given by the dataset, we will come back to it later on:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#9998) ['one \\n','two \\n','three \\n','four \\n','five \\n','six \\n','seven \\n','eight \\n','nine \\n','ten \\n'...]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lines = L()\n", - "with open(path/'train.txt') as f: lines += L(*f.readlines())\n", - "with open(path/'valid.txt') as f: lines += L(*f.readlines())\n", - "lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We take all those lines and concatenate them in one big stream. To mark when we go from one number to the next, we use a '.' as separation:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text = ' . 
'.join([l.strip() for l in lines])\n", - "text[:100]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use word tokenization for this dataset, by splitting on spaces:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokens = text.split(' ')\n", - "tokens[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To numericalize, we have to create a list of all the unique tokens (our *vocab*):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab = L(*tokens).unique()\n", - "vocab" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we can convert our tokens into numbers by looking up the index of each in the vocab:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#63095) [0,1,2,1,3,1,4,1,5,1...]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "word2idx = {w:i for i,w in enumerate(vocab)}\n", - "nums = L(word2idx[i] for i in tokens)\n", - "nums" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Our first language model from scratch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One simple way to turn this into a neural network would be to specify that we are going to predict each word based on the previous three words. 
Therefore, we could create a list of every sequence of three words as independent variables, and the next word after each sequence as the dependent variable. \n", - "\n", - "We can do that with plain Python. Let us do it first with tokens just to confirm what it looks like:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will do it with tensors of the numericalized values, which is what the model will actually use:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10, 1, 11]), 1),(tensor([ 1, 12, 1]), 13),(tensor([13, 1, 14]), 1),(tensor([ 1, 15, 1]), 16)...]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))\n", - "seqs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we can batch those easily using the `DataLoader` class. For now we will split randomly the sequences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bs = 64\n", - "cut = int(len(seqs) * 0.8)\n", - "dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now create a neural network architecture that takes three words as input, and returns a prediction of the probability of each possible next word in the vocab. We will use three standard linear layers, but with two tweaks.\n", - "\n", - "The first tweak is that the first linear layer will use only the first word's embedding as activations, the second layer will use the second word's embedding plus the first layer's output activations, and the third layer will use the third word's embedding plus the second layer's output activations. The key effect of this is that every word is interpreted in the information context of any words preceding it. \n", - "\n", - "The second tweak is that each of these three layers will use the same weight matrix. The way that one word impacts the activations from previous words should not change depending on the position of a word. In other words, activation values will change as data moves through the layers, but the layer weights themselves will not change from layer to layer. So a layer does not learn one sequence position; it must learn to handle all positions.\n", - "\n", - "Since layer weights do not change, you might think of the sequential layers as the \"same layer\" repeated. In fact PyTorch makes this concrete; we can just create one layer, and use it multiple times." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Our language model in PyTorch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now create the language model module that we described earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel1(Module):\n", - " def __init__(self, vocab_sz, n_hidden):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", - " self.h_h = nn.Linear(n_hidden, n_hidden) \n", - " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", - " \n", - " def forward(self, x):\n", - " h = F.relu(self.h_h(self.i_h(x[:,0])))\n", - " h = h + self.i_h(x[:,1])\n", - " h = F.relu(self.h_h(h))\n", - " h = h + self.i_h(x[:,2])\n", - " h = F.relu(self.h_h(h))\n", - " return self.h_o(h)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you see, we have created three layers:\n", - "\n", - "- The embedding layer (`i_h` for *input* to *hidden*)\n", - "- The linear layer to create the activations for the next word (`h_h` for *hidden* to *hidden*)\n", - "- A final linear layer to predict the fourth word (`h_o` for *hidden* to *output*)\n", - "\n", - "This might be easier to represent in pictorial form. Let's define a simple pictorial representation of basic neural networks. Here's how we're going to represent a neural net with one hidden layer:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Pictorial" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each shape represents activations: rectangle for input, circle for hidden (inner) layer activations, and triangle for output activations:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Shapes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An arrow represents the actual layer computation—i.e. the linear layer followed by the activation layers. 
Using this notation, here's what our simple language model looks like:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Representation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To simplify things, we've removed the details of the layer computation from each arrow. We've also color-coded the arrows, such that all arrows with the same color have the same weight matrix. For instance, all the input layers use the same embedding matrix, so they all have the same color (green).\n", - "\n", - "Let's try training this model and see how it goes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
01.8242971.9709410.46755400:02
11.3869731.8232420.46755400:02
21.4175561.6544970.49441400:02
31.3764401.6508490.49441400:02
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)\n", - "learn.fit_one_cycle(4, 1e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see if this is any good, let's check what would a very simple model give us. In this case we could always predict the most common token, so let's find out which token is the most often the target in our validation set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(tensor(29), 'thousand', 0.15165200855716662)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n,counts = 0,torch.zeros(len(vocab))\n", - "for x,y in dls.valid:\n", - " n += y.shape[0]\n", - " for i in range_of(vocab): counts[i] += (y==i).long().sum()\n", - "idx = torch.argmax(counts)\n", - "idx, vocab[idx.item()], counts[idx].item()/n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The most common token has the index 29, which corresponds to the token 'thousand'. Always predicting this token would give us an accuracy of roughly 15\\%, so we are faring way better!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> A: My first guess was that the separator would be the most common token, since there is one for every number. But looking at `tokens` reminded me that large numbers are written with many words, so on the way to 10,000 you write \"thousand\" a lot: five thousand, five thousand and one, five thousand and two, etc.. Oops! Looking at your data is great for noticing subtle features and also embarrassingly obvious ones." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Our first recurrent neural network" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the code for our module, we could simplify it by replacing the duplicated code that calls the layers with a for loop. As well as making our code simpler, this will also have the benefit that we could apply our module equally well to token sequences of different lengths; we would not be restricted to token lists of length three." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel2(Module):\n", - " def __init__(self, vocab_sz, n_hidden):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", - " self.h_h = nn.Linear(n_hidden, n_hidden) \n", - " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", - " \n", - " def forward(self, x):\n", - " h = 0\n", - " for i in range(3):\n", - " h = h + self.i_h(x[:,i])\n", - " h = F.relu(self.h_h(h))\n", - " return self.h_o(h)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check that we get the same results using this refactoring:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
01.8162741.9641430.46018500:02
11.4238051.7399640.47325900:02
21.4303271.6851720.48538200:02
31.3883901.6570330.47040600:02
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)\n", - "learn.fit_one_cycle(4, 1e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also refactor our pictorial representation in exactly the same way (we're also removing the details of activation sizes here, and using the same arrow colors as the previous diagram):" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Basic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You will see that there is a set of activations which are being updated each time through the loop, and are stored in the variable `h` — this is called the *hidden state*." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> Jargon: hidden state: the activations that are updated at each step of a recurrent neural network" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In `LMModel2` we only have one weight matrix, `h_h`, to calculate the next hidden state from the previous hidden state. Therefore the hidden state isn't able to easily calculate anything much more complex than a linear relationship. In next chapter we'll see how to create truely deep RNNs.\n", - "\n", - "A neural network which is defined using a loop like this is called a *recurrent neural network*, also known as an RNN. It is important to realise that an RNN is not a complicated new architecture, but is simply a refactoring of a multilayer neural network using a for loop.\n", - "\n", - "> A: My true opinion: if they were called \"looping neural networks\", or LNNs, they would seem 50% less daunting!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Improving the RNN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Maintaining the state of an RNN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the code for our RNN, one thing that seems problematic is that we are initialising our hidden state to zero for every new input sequence. Why is that a problem? We made our sample sequences short so they would fit easily into batches. But if we order those samples correctly, those sample sequences will be read in order by the model, exposing the model to long stretches of the original sequence. \n", - "\n", - "But because we initialize the model's hidden state to zero for each new sample, we are throwing away all the information we have about the sentences we have seen so far, which means that our model doesn't actually know where we are up to in the overall counting sequence. This is easily fixed; we can simply move the initialisation of the hidden state to `__init__`.\n", - "\n", - "But this fix will create its own subtle, but important, problem. It effectively makes our neural network as deep as the entire number of tokens in our document. For instance, if there were 10,000 tokens in our dataset, we would be creating a 10,000 layer neural network.\n", - "\n", - "To see this, consider the original pictorial representation of our recurrent neural network, before refactoring it with a for loop. You can see each layer corresponds with one token input. When we talk about the representation of a recurrent neural network before refactoring with the for loop, we call this the *unrolled representation*. 
It is often helpful to consider the unrolled representation when trying to understand an RNN.\n", - "\n", - "The problem with a 10,000 layer neural network is that if and when you get to the 10,000th word of the dataset, you will still need to calculate the derivatives all the way back to the first layer. This is going to be very slow indeed, and very memory intensive. It is unlikely that you could store even one mini batch on your GPU.\n", - "\n", - "The solution to this is to tell PyTorch that we do not want to back propagate the derivatives through the entire implicit neural network. Instead, we will just keep the last three layers of gradients. To remove all of the gradient history in PyTorch, we use the `detach` method.\n", - "\n", - "Here is the new version of our RNN. It is now stateful, because it remembers its activations between different calls to `forward`, which represent its use for different samples in the batch:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel3(Module):\n", - " def __init__(self, vocab_sz, n_hidden):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", - " self.h_h = nn.Linear(n_hidden, n_hidden) \n", - " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", - " self.h = 0\n", - " \n", - " def forward(self, x):\n", - " for i in range(3):\n", - " self.h = self.h + self.i_h(x[:,i])\n", - " self.h = F.relu(self.h_h(self.h))\n", - " out = self.h_o(self.h)\n", - " self.h = self.h.detach()\n", - " return out\n", - " \n", - " def reset(self): self.h = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you think about it, this model will have the same activations whatever the sequence length we pick, because the hidden state will remember the last activation from the previous batch. 
The only thing that will be different are the gradients computed at each step: they will only be calculated on sequence length tokens in the past, instead of the whole stream. That is why this sequence length is often called *bptt* for back-propagation through time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* jargon: Back propagation through time (BPTT): Treating a neural net with effectively one layer per time step (usually refactored using a loop) as one big model, and calculating gradients on it in the usual way. To avoid running out of memory and time, we usually use _truncated_ BPTT, which \"detaches\" the history of computation steps in the hidden state every few time steps." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use `LMModel3`, we need to make sure the samples are going to be seen in a certain order. As we saw in the previous chapter, if the first line of the first batch is our `dset[0]` then the second batch should have `dset[1]` as the first line, so that the model sees the text flowing.\n", - "\n", - "`LMDataLoader` was doing this for us in the previous chapter. This time we're going to do it ourselves.\n", - "\n", - "To do this, we are going to rearrange our dataset. First we divide the samples into `m = len(dset) // bs` groups (this is the equivalent of splitting the whole concatenated dataset into, for instance, 64 equally sized pieces, since we're using `bs=64` here). `m` is the length of each of these pieces. 
For instance, if we're using our whole dataset (although we'll actually split it into train vs valid in a moment), that will be:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(328, 64, 21031)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m = len(seqs)//bs\n", - "m,bs,len(seqs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first batch will be composed of the samples:\n", - "\n", - " (0, m, 2*m, ..., (bs-1)*m)\n", - "\n", - "then the second batch of the samples: \n", - "\n", - " (1, m+1, 2*m+1, ..., (bs-1)*m+1)\n", - "\n", - "and so forth. This way, at each epoch, the model will see a chunk of contiguous text of size `3*m` (since each text is of size 3) on each line of the batch.\n", - "\n", - "The following function does that reindexing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def group_chunks(ds, bs):\n", - " m = len(ds) // bs\n", - " new_ds = L()\n", - " for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))\n", - " return new_ds" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we just pass `drop_last=True` when building our `DataLoaders` to drop the last batch that has not a shape of `bs`, we also pass `shuffle=False` to make sure the texts are read in order." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cut = int(len(seqs) * 0.8)\n", - "dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs), group_chunks(seqs[cut:], bs), bs=bs, drop_last=True, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The last thing we add is a little tweak of the training loop via a `Callback`. 
We will talk more about callbacks in <>; this one will call the `reset` method of our model at the beginning of each epoch and before each validation phase. Since we implemented that method to zero the hidden state of the model, this will make sure we start we a clean state before reading those continuous chunks of text. We can also start training a bit longer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
01.6770741.8273670.46754800:02
11.2827221.8709130.38894200:02
21.0907051.6517930.46250000:02
31.0050921.6137940.51658700:02
40.9659751.5607750.55120200:02
50.9161821.5958570.56057700:02
60.8976571.5397330.57427900:02
70.8362741.5851410.58317300:02
80.8058771.6298080.58677900:02
90.7950961.6512670.58894200:02
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,\n", - " metrics=accuracy, cbs=ModelReseter)\n", - "learn.fit_one_cycle(10, 3e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Creating more signal" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Another problem with our current approach is that we only predict one output word for each three input words. That means that the amount of signal that we are feeding back to update weights with is not as large as it could be. It would be better if we predicted the next word after every single word, rather than every three words. Here's the pictorial version:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"RNN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is easy enough to add. We need to first change our data so that the dependent variable has each of the three next words after each of our three input words. Instead of 3, we use an attribute, `sl` (for sequence length) and make it a bit bigger:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sl = 16\n", - "seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))\n", - " for i in range(0,len(nums)-sl-1,sl))\n", - "cut = int(len(seqs) * 0.8)\n", - "dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),\n", - " group_chunks(seqs[cut:], bs),\n", - " bs=bs, drop_last=True, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the first element of `seqs`, we can see that it contains two lists of the same size. 
The second list is the same as the first, but offset by one element:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(#16) ['one','.','two','.','three','.','four','.','five','.'...],\n", - " (#16) ['.','two','.','three','.','four','.','five','.','six'...]]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[L(vocab[o] for o in s) for s in seqs[0]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we need to modify our model so that it outputs a prediction after every word, rather than just at the end of a three word sequence:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel4(Module):\n", - " def __init__(self, vocab_sz, n_hidden):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", - " self.h_h = nn.Linear(n_hidden, n_hidden) \n", - " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", - " self.h = 0\n", - " \n", - " def forward(self, x):\n", - " outs = []\n", - " for i in range(sl):\n", - " self.h = self.h + self.i_h(x[:,i])\n", - " self.h = F.relu(self.h_h(self.h))\n", - " outs.append(self.h_o(self.h))\n", - " self.h = self.h.detach()\n", - " return torch.stack(outs, dim=1)\n", - " \n", - " def reset(self): self.h = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This model will return outputs of shape `bs x sl x vocab_sz` (since we stacked on `dim=1`). 
Our targets are of shape `bs x sl`, so we need to flatten those before using them in `F.cross_entropy`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def loss_func(inp, targ):\n", - " return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can now use this loss function to train the model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
03.1032982.8743410.21256500:01
12.2319641.9712800.46215800:01
21.7113581.8135470.46118200:01
31.4485161.8281760.48323600:01
41.2886301.6595640.52067100:01
51.1614701.7140230.55493200:01
61.0555681.6609160.57503300:01
70.9607651.7196240.59106400:01
80.8701531.8395600.61466500:01
90.8085451.7702780.62434900:01
100.7580841.8429310.61075800:01
110.7193201.7995270.64656600:01
120.6834391.9179280.64982100:01
130.6602831.8747120.62858100:01
140.6461541.8775190.64005500:01
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,\n", - " metrics=accuracy, cbs=ModelReseter)\n", - "learn.fit_one_cycle(15, 3e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to train for longer, since the task has changed a bit and is more complicated now. But we end up with a good result... At least, sometimes. If you run it a few times, you'll see that you can get quite different results on different runs. That's because effectively we have a very deep network here, which can result in very large or very small gradients. We'll see in the next chapter how to resolve this, by using the `LSTM` architecture.\n", - "\n", - "We can also see that `valid_loss` is getting worse, so it may help to add some additional regularization. That will be provided by the `AWD` variant of `LSTM`, which we'll also see in the next chapter.\n", - "\n", - "By combining these techniques, we'll see how to get around 85% accuracy on this dataset!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Questionnaire" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. If the dataset for your project is so big and complicated that working with it takes a significant amount of time, what should you do?\n", - "1. Why do we concatenating the documents in our dataset before creating a language model?\n", - "1. To use a standard fully connected network to predict the fourth word given the previous three words, what two tweaks do we need to make?\n", - "1. How can we share a weight matrix across multiple layers in PyTorch?\n", - "1. Write a module which predicts the third word given the previous two words of a sentence, without peeking.\n", - "1. What is a recurrent neural network?\n", - "1. What is hidden state?\n", - "1. 
What is the equivalent of hidden state in `LMModel1`?\n", - "1. To maintain the state in an RNN, why is it important to pass the text to the model in order?\n", - "1. What is an unrolled representation of an RNN?\n", - "1. Why can maintaining the hidden state in an RNN lead to memory and performance problems? How do we fix this problem?\n", - "1. What is BPTT?\n", - "1. Write code to print out the first few batches of the validation set, including converting the token IDs back into English strings, as we showed for batches of IMDb data in the previous chapter.\n", - "1. What does the `ModelReseter` callback do? Why do we need it?\n", - "1. What are the downsides of predicting just one output word for each three input words?\n", - "1. Why do we need a custom loss function for `LMModel4`?\n", - "1. Why is the training of `LMModel4` unstable?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Further research" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. In `LMModel2`, why can `forward` start with `h=0`? Why don't we need to say `h=torch.zeros(…)`?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "split_at_heading": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": true, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/12_better_rnn.ipynb b/12_better_rnn.ipynb deleted file mode 100644 index 22b14ca..0000000 --- a/12_better_rnn.ipynb +++ /dev/null @@ -1,1154 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "from utils import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "hide_input": false - }, - "outputs": [], - "source": [ - "#hide\n", - "from fastai2.text.all import *\n", - "path = untar_data(URLs.HUMAN_NUMBERS)\n", - "lines = L()\n", - "with open(path/'train.txt') as f: lines += L(*f.readlines())\n", - "with open(path/'valid.txt') as f: lines += L(*f.readlines())\n", - "text = ' . 
'.join([l.strip() for l in lines])\n", - "tokens = text.split(' ')\n", - "vocab = L(*tokens).unique()\n", - "word2idx = {w:i for i,w in enumerate(vocab)}\n", - "nums = L(word2idx[i] for i in tokens)\n", - "\n", - "def group_chunks(ds, bs):\n", - " m = len(ds) // bs\n", - " new_ds = L()\n", - " for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))\n", - " return new_ds" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "[[chapter_better_rnn]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Making our RNN state of the art" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We saw in the last chapter how to build a basic RNN from scratch. Now we will see how to make it better up until the AWD LSTM architecture we used in <> on this text classification problem.\n", - "\n", - "We won't go over the whole data preparation process again. To make the comparison fair against our last example, we use the same batch size and sequence length:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sl,bs = 16,64\n", - "seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))\n", - " for i in range(0,len(nums)-sl-1,sl))\n", - "cut = int(len(seqs) * 0.8)\n", - "dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),\n", - " group_chunks(seqs[cut:], bs),\n", - " bs=bs, drop_last=True, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The obvious way to get a better model is to go deeper: as we discussed in the last chapter, we only have one linear layer between the hidden state and the output activations in our basic RNN, so maybe we would get better results with more." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multilayer RNNs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In a multilayer RNN, we pass the activations from our recurrent neural network into a second recurrent neural network, like so:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"2-layer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "…or in an unrolled representation:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"2-layer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's save some time by using PyTorch's RNN class, which implements exactly what we have created above, but also gives us the option to stack multiple RNNs, as we have discussed:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel5(Module):\n", - " def __init__(self, vocab_sz, n_hidden, n_layers):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", - " self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)\n", - " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", - " self.h = torch.zeros(n_layers, bs, n_hidden)\n", - " \n", - " def forward(self, x):\n", - " res,h = self.rnn(self.i_h(x), self.h)\n", - " self.h = h.detach()\n", - " return self.h_o(res)\n", - " \n", - " def reset(self): self.h.zero_()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
03.0558532.5916400.43790700:01
12.1623591.7873100.47159800:01
21.7106631.9418070.32177700:01
31.5207831.9997260.31201200:01
41.3308462.0129020.41324900:01
51.1632971.8961920.45068400:01
61.0338132.0052090.43481400:01
70.9190902.0470830.45670600:01
80.8229392.0680310.46883100:01
90.7501802.1360640.47509800:01
100.6951202.1391400.48543300:01
110.6557522.1550810.49365200:01
120.6296502.1625830.49853500:01
130.6135832.1716490.49104800:01
140.6043092.1803550.48787400:01
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelReseter)\n", - "learn.fit_one_cycle(15, 3e-3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that's disappointing... we are doing more poorly than the single-layer RNN from the end of last chapter. The reason is that we have a deeper model, leading to exploding or disappearing activations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exploding or disappearing activations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In practice, creating accurate models from this kind of RNN is difficult. We will get better results if we call `detach` less often, and have more layers — this gives our RNN a longer time horizon to learn from, and richer features to create. But it also means we have a deeper model to train. The key challenge in the development of deep learning has been figuring out how to train these kinds of models.\n", - "\n", - "The reason this is challenging is because of what happens when you multiply by a matrix many times. Think about what happens when you multiply by a number many times. For example, if you multiply by two, starting at one, you get the sequence 1, 2, 4, 8,… after 32 steps you are already at 4,294,967,296. A similar issue happens if we multiply by 0.5: we get 0.5, 0.25, 0.125… and after 32 steps it's 0.00000000023. As you can see, a number even slightly higher or lower than one results in an explosion or disappearance of our number, after just a few repeated multiplications.\n", - "\n", - "Because matrix multiplication is just multiplying numbers and adding them up, exactly the same thing happens with repeated matrix multiplications. 
And a deep neural network is just repeated matrix multiplications--each extra layer is another matrix multiplication. This means that it is very easy for a deep neural network to end up with extremely large, or extremely small numbers.\n", - "\n", - "This is a problem, because the way computers store numbers (known as \"floating point\") means that they become less and less accurate the further away the numbers get from zero. This diagram, from the excellent article [What you never wanted to know about floating point but will be forced to find out](http://www.volkerschatz.com/science/float.html), shows how the precision of floating point numbers varies over the number line:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Precision" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This inaccuracy means that often the gradients calculated for updating the weights end up as zero or infinity for deep networks. This is commonly referred to as *vanishing gradients* or *exploding gradients*. That means that in SGD, the weights are updated either not at all, or jump to infinity. Either way, they won't improve with training.\n", - "\n", - "Researchers have developed a number of ways to tackle this problem, which we will be discussing later in the book. One way to tackle the problem is to change the definition of a layer in a way that makes it less likely to have exploding activations. We'll look at the details of how this is done in <>, when we discuss *batch normalization*, and <>, when we discuss *ResNets*, although these details don't generally matter in practice (unless you are a researcher that is creating new approaches to solving this problem). 
Another way to deal with this is by being careful about *initialization*, which is a topic we'll investigate in <>.\n", - "\n", - "For RNNs, there are two types of layers frequently used to avoid exploding activations, and they are: *gated recurrent units* (GRU), and *Long Short-Term Memory* (LSTM). Both of these are available in PyTorch, and are drop-in replacements for the RNN layer. We will only cover LSTMs in this book; there are plenty of good tutorials online explaining GRUs, which are a minor variant on the LSTM design." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LSTM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "LSTM (for long short-term memory) is an architecture that was introduced back in 1997 by Jürgen Schmidhuber and Sepp Hochreiter. In this architecture, there are not one but two hidden states. In our base RNN, the hidden state is the output of the RNN at the previous time step. That hidden state is then responsible for doing two things at a time:\n", - "\n", - "- having the right information for the output layer to predict the correct next token\n", - "- retaining memory of everything that happened in the sentence\n", - "\n", - "Consider, for example, the sentences \"Henry has a dog and he likes his dog very much\" and \"Sophie has a dog and she likes her dog very much\". It's very clear that the RNN needs to remember the name at the beginning of the sentence to be able to predict *he/she* or *his/her*. \n", - "\n", - "In practice, RNNs are really bad at retaining memory of what happened much earlier in the sentence, which is the motivation to have another hidden state (called cell state) in the LSTM. The cell state will be responsible for keeping *long short-term memory*, while the hidden state will focus on the next token to predict. Let's have a closer look at how this is achieved and build one LSTM from scratch." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Building an LSTM from scratch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The schematic of an LSTM is given like so:\n", - "\n", - "\"A" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this picture, our input $x_{t}$ enters on the bottom with the previous hidden state ($h_{t-1}$) and cell state ($c_{t-1}$). The four orange boxes represent four layers with the activation being either sigmoid (for $\\sigma$) or tanh. tanh is just a sigmoid rescaled to the range -1 to 1. Its mathematical expression can be written like this:\n", - "\n", - "$$\\tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x}+e^{-x}} = 2 \\sigma(2x) - 1$$\n", - "\n", - "where $\\sigma$ is the sigmoid function. The green boxes are elementwise operations. What goes out is the new hidden state ($h_{t}$) and new cell state ($c_{t}$) on the left, ready for our next input. The new hidden state is also used as output, which is why the arrow splits to go up.\n", - "\n", - "Let's go over the four neural nets (called *gates*) one by one and explain the diagram, but before this, notice how very little the cell state (on the top) is changed. It doesn't even go directly through a neural net! This is exactly why it will carry on a longer-term state.\n", - "\n", - "First, the arrows for input and old hidden state are joined together. In the RNN we wrote in the past chapter, we were adding them together. In the LSTM, we stack them in one big tensor. This means the dimension of our embeddings (which is the dimension of $x_{t}$) can be different than the dimension of our hidden state. If we call those `n_in` and `n_hid`, the arrow at the bottom is of size `n_in + n_hid`, thus all the neural nets (orange boxes) are linear layers with `n_in + n_hid` inputs and `n_hid` outputs.\n", - "\n", - "The first gate (looking from the left to right) is called the *forget gate*. 
Since it's a linear layer followed by a sigmoid, its output will have scalars between 0 and 1. We multiply this result by the cell state, so for all the values close to 0, we will forget what was inside that cell state (and for the values close to 1 it doesn't do anything). This gives the ability to the LSTM to forget things about its long-term state. For instance, when crossing a period or an `xxbos` token, we would expect it to (have learned to) reset its cell state.\n", - "\n", - "The second gate is called the *input gate*. It works with the third gate (which doesn't really have a name but is sometimes called the *cell gate*) to update the cell state. For instance we may see a new gender pronoun, so we must replace the information about gender that the forget gate removed by the new one. Like the forget gate, the input gate ends up on a product, so it just decides which element of the cell state to update (values close to 1) or not (values close to 0). The third gate will then fill those values with things between -1 and 1 (thanks to the tanh). The result is then added to the cell state.\n", - "\n", - "The last gate is the *output gate*. It decides which information to take from the cell state to generate the output. 
The cell state goes through a tanh before this and the output gate combined with the sigmoid decides which values to take inside it.\n", - "\n", - "In terms of code, we can write the same steps like this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LSTMCell(Module):\n", - " def __init__(self, ni, nh):\n", - " self.forget_gate = nn.Linear(ni + nh, nh)\n", - " self.input_gate = nn.Linear(ni + nh, nh)\n", - " self.cell_gate = nn.Linear(ni + nh, nh)\n", - " self.output_gate = nn.Linear(ni + nh, nh)\n", - "\n", - " def forward(self, input, state):\n", - " h,c = state\n", - " h = torch.stack([h, input], dim=1)\n", - " forget = torch.sigmoid(self.forget_gate(h))\n", - " c = c * forget\n", - " inp = torch.sigmoid(self.input_gate(h))\n", - " cell = torch.tanh(self.cell_gate(h))\n", - " c = c + inp * cell\n", - " out = torch.sigmoid(self.output_gate(h))\n", - " h = outgate * torch.tanh(c)\n", - " return h, (h,c)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In practice, we can then refactor the code. Also, in terms of performance, it's better to do one big matrix multiplication than four smaller ones (that's because we only launch the special fast kernel on GPU once, and it gives the GPU more work to do in parallel). The stacking takes a bit of time (since we have to move one of the tensors around on the GPU to have it all in a contiguous array), so we use two separate layers for the input and the hidden state. 
The optimized and refactored code then looks like that:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LSTMCell(Module):\n", - " def __init__(self, ni, nh):\n", - " self.ih = nn.Linear(ni,4*nh)\n", - " self.hh = nn.Linear(nh,4*nh)\n", - "\n", - " def forward(self, input, state):\n", - " h,c = state\n", - " #One big multiplication for all the gates is better than 4 smaller ones\n", - " gates = (self.ih(input) + self.hh(h)).chunk(4, 1)\n", - " ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])\n", - " cellgate = gates[3].tanh()\n", - "\n", - " c = (forgetgate*c) + (ingate*cellgate)\n", - " h = outgate * c.tanh()\n", - " return h, (h,c)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we use the PyTorch `chunk` method to split our tensor into 4 pieces, e.g.:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t = torch.arange(0,10); t" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t.chunk(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training a language model using LSTMs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is the same network as before, using a two-layer LSTM. 
We can train it at a higher learning rate, for a shorter time, and get better accuracy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel6(Module):\n", - " def __init__(self, vocab_sz, n_hidden, n_layers):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", - " self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)\n", - " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", - " self.h = [torch.zeros(2, bs, n_hidden) for _ in range(n_layers)]\n", - " \n", - " def forward(self, x):\n", - " res,h = self.rnn(self.i_h(x), self.h)\n", - " self.h = [h_.detach() for h_ in h]\n", - " return self.h_o(res)\n", - " \n", - " def reset(self): \n", - " for h in self.h: h.zero_()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
03.0008212.6639420.43831400:02
12.1396422.1847800.24047900:02
21.6072751.8126820.43977900:02
31.3477111.8309820.49747700:02
41.1231131.9377660.59440100:02
50.8520422.0121270.63159200:02
60.5654941.3127420.72574900:02
70.3474451.2979340.71126300:02
80.2081911.4412690.73120100:02
90.1263351.5699520.73730500:02
100.0797611.4271870.75415000:02
110.0529901.4949900.74511700:02
120.0390081.3937310.75789400:02
130.0315021.3732100.75846400:02
140.0280681.3680830.75846400:02
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = Learner(dls, LMModel6(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelReseter)\n", - "learn.fit_one_cycle(15, 1e-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that's better than a multilayer RNN! We can still see there is a bit of overfitting, which is a sign that a bit of regularization might help." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Regularizing an LSTM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recurrent neural networks, in general, are hard to train. Using LSTMs (or GRUs) cell make training easier than vanilla RNNs, but there are still very prone to overfitting. Data augmentation, while it exists for text data, is less often used because in most cases, it requires another model to generate random augmentation (by translating in another language and back to the language used for instance). Overall, data augmentation for text data is currently not a well explored space.\n", - "\n", - "However, there are other regularization techniques we can use instead, which were thoroughly studied for use with LSTMs in the paper [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182). This paper showed how effective use of *dropout*, *activation regularization*, and *temporal activation regularization* could allow an LSTM to beat state of the art results that previously required much more complicated models. They called an LSTM using these techniques an *AWD LSTM*. We'll look at each of these techniques in turn." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dropout" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dropout is a regularization technique that was introduce by Geoffrey Hinton et al. 
in [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). The basic idea is to randomly change some activations to zero at training time. This makes sure all neurons actively work toward the output as seen in this figure from the original paper:\n", - "\n", - "\"A\n", - "\n", - "Hinton used a nice metaphor when he explained, in an interview, the inspiration for dropout:\n", - "\n", - "> : \"I went to my bank. The tellers kept changing and I asked one of them why. He said he didn’t know but they got moved around a lot. I figured it must be because it would require cooperation between employees to successfully defraud the bank. This made me realize that randomly removing a different subset of neurons on each example would prevent conspiracies and thus reduce overfitting\"\n", - "\n", - "In the same interview, he also explained that neuroscience provided additional inspiration:\n", - "\n", - "> : \"We don't really know why neurons spike. One theory is that they want to be noisy so as to regularize, because we have many more parameters than we have data points. The idea of dropout is that if you have noisy activations, you can afford to use a much bigger model.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see there that if we just zero those activations without doing anything else, our model will have problems to train: if we go from the sum of 5 activations (that are all positive numbers since we apply a ReLU) to just 2, this won't have the same scale. 
Therefore if we dropout with a probability `p`, we rescale all activation by dividing them by `1-p` (on average `p` will be zeroed, so it leaves `1-p`), as shown in this diagram from the original paper:\n", - "\n", - "\"A\n", - "\n", - "This is a full implementation of the dropout layer in PyTorch (although PyTorch's native layer is actually written in C, not Python):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Dropout(Module):\n", - " def __init__(self, p): self.p = p\n", - " def forward(self, x):\n", - " if not self.training: return x\n", - " mask = x.new(*x.shape).bernoulli_(1-p)\n", - " return x * mask.div_(1-p)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `bernoulli_` method is creating a tensor with random zeros (with probability p) and ones (with probability 1-p), which is then multiplied with our input before dividing by `1-p`. Note the use of the `training` attribute, which is available in any PyTorch `nn.Module`, and tells us if we are doing training or inference.\n", - "\n", - "> note: In previous chapters of the book we'd be adding a code example for `bernoulli_` here, so you can see exactly how it works. But now that you know enough to do this yourself, we're going to be doing fewer and fewer examples for you, and instead expecting you to do your own experiments to see how things work. In this case, you'll see in the end-of-chapter questionnaire that we're asking you to experiment with `bernoulli_`--but don't wait for us to ask you to experiment to develop your understanding of the code we're studying, go ahead and do it anyway!\n", - "\n", - "Using dropout before passing the output of our LSTM to the final layer will help reduce overfitting. 
Dropout is also used in many other models, including the default CNN head used in `fastai.vision`, and is also available in `fastai.tabular` by passing the `ps` parameter (where each \"p\" is passed to each added `Dropout` layer), as we'll see in <>." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dropout has a different behavior in training and validation mode, which we achieved using the `training` attribute in `Dropout` above. Calling the `train()` method on a `Module` sets `training` to `True` (both for the module you call the method on, and for every module it recursively contains), and `eval()` sets it to `False`. This is done automatically when calling the methods of `Learner`, but if you are not using that class, remember to switch from one to the other as needed." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### AR and TAR regularization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "AR (for *activation regularization*) and TAR (for *temporal activation regularization*) are two regularization methods very similar to weight decay. When applying weight decay, we add a small penalty to the loss that aims at making the weights as small as possible. For the activation regularization, it's the final activations produced by the LSTM that we will try to make as small as possible, instead of the weights.\n", - "\n", - "To regularize the final activations, we have to store those somewhere, then add the means of the squares of them to the loss (along with a multiplier `alpha`, which is just like `wd` for weight decay):\n", - "\n", - "``` python\n", - "loss += alpha * activations.pow(2).mean()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Temporal activation regularization is linked to the fact we are predicting tokens in a sentence. That means it's likely that the outputs of our LSTMs should somewhat make sense when we read them in order. 
TAR is there to encourage that behavior by adding a penalty to the loss to make the difference between two consecutive activations as small as possible: our activations tensor has a shape `bs x sl x n_hid`, and we read consecutive activation on the sequence length axis (so the dimension in the middle). With this, TAR can be expressed as:\n", - "\n", - "``` python\n", - "loss += beta * (activations[:,1:] - activations[:,:-1]).pow(2).mean()\n", - "```\n", - "\n", - "`alpha` and `beta` are then two hyper-parameters to tune. To make this work, we need our model with dropout to return three things: the proper output, the activations of the LSTM pre-dropout and the activations of the LSTM post-dropout. AR is often applied on the dropped out activations (to not penalize the activations we turned in 0s afterward) while TAR is applied on the non-dropped out activations (because those 0s create big differences between two consecutive timesteps). There is then a callback called `RNNRegularizer` that will apply this regularization for us." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training a weight-tied regularized LSTM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can combine dropout (applied before we go in our output layer) with the AR and TAR regularization to train our previous LSTM. We just need to return three things instead of one: the normal output of our LSTM, the dropped-out activations and the activations from our LSTMs. Those last two will be picked up by the callback `RNNRegularization` for the contributions it has to make to the loss.\n", - "\n", - "Another useful trick we can add from the AWD LSTM paper is *weight tying*. In a language model, the input embeddings represent a mapping from English words to activations, and the output hidden layer represents a mapping from activations to English words. We might expect, intuitively, that these mappings could be the same. 
We can represent this in PyTorch by assigning the same weight matrix to each of these layers:\n", - "\n", - " self.h_o.weight = self.i_h.weight\n", - "\n", - "In `LMMModel7`, we include these final tweaks:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class LMModel7(Module):\n", - " def __init__(self, vocab_sz, n_hidden, n_layers, p):\n", - " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", - " self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)\n", - " self.drop = nn.Dropout(p)\n", - " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", - " self.h_o.weight = self.i_h.weight\n", - " self.h = [torch.zeros(2, bs, n_hidden) for _ in range(n_layers)]\n", - " \n", - " def forward(self, x):\n", - " raw,h = self.rnn(self.i_h(x), self.h)\n", - " out = self.drop(raw)\n", - " self.h = [h_.detach() for h_ in h]\n", - " return self.h_o(out),raw,out\n", - " \n", - " def reset(self): \n", - " for h in self.h: h.zero_()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can create a regularized `Learner` using the `RNNRegularizer` callback:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),\n", - " loss_func=CrossEntropyLossFlat(), metrics=accuracy,\n", - " cbs=[ModelReseter, RNNRegularizer(alpha=2, beta=1)])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A `TextLearner` automatically adds those two callbacks for us (with default for `alpha` and `beta` as above) so we can simplify the line above to:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),\n", - " loss_func=CrossEntropyLossFlat(), metrics=accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can the train the 
model, and add additional regularization by increasing the weight decay to `0.1`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
02.6938852.0134840.46663400:02
11.6855491.1873100.62931300:02
20.9733070.7913980.74560500:02
30.5558230.6404120.79410800:02
40.3518020.5572470.83610000:02
50.2449860.5949770.80729200:02
60.1922310.5116900.84676100:02
70.1624560.5203700.85807300:02
80.1426640.5259180.84228500:02
90.1284930.4950290.85807300:02
100.1175890.4642360.86718800:02
110.1098080.4665500.86930300:02
120.1042160.4551510.87182600:02
130.1002710.4526590.87361700:02
140.0981210.4583720.86938500:02
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn.fit_one_cycle(15, 1e-2, wd=0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now this is far better than our previous model!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You have now seen everything that is inside the AWD-LSTM architecture we used in text classification in <>. It uses dropouts in a lot more places:\n", - "\n", - "- embedding dropout (just after the embedding layer)\n", - "- input dropout (after the embedding layer)\n", - "- weight dropout (applied to the weights of the LSTM at each training step)\n", - "- hidden dropout (applied to the hidden state between two layers)\n", - "\n", - "which makes it even more regularized. Since fine-tuning those five dropout values (adding the dropout before the output layer) is complicated, so we have determined good defaults, and allow the magnitude of dropout to be tuned overall with the `drop_mult` parameter you saw (which is multiplied by each dropout).\n", - "\n", - "Another architecture that is very powerful, especially in \"sequence to sequence\" problems (that is, problems where the dependent variable is itself a variable length sequence, such as language translation), is the Transformers architecture. You can find it in an online bonus chapter on the book website." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Questionnaire" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. In the unrolled representation, we can see that a recurrent neural network actually has many layers. So why do we need to stack RNNs to get better results?\n", - "1. Draw a representation of a stacked (multilayer) RNN.\n", - "1. Why should we get better results in an RNN if we call `detach` less often? 
Why might this not happen in practice with a simple RNN?\n", - "1. Why can a deep network result in very large or very small activations? Why does this matter?\n", - "1. In a computer's floating point representation of numbers, which numbers are the most precise?\n", - "1. Why do vanishing gradients prevent training?\n", - "1. Why does it help to have two hidden states in the LSTM architecture? What is the purpose of each one?\n", - "1. What are these two states called in an LSTM?\n", - "1. What is tanh, and how is it related to sigmoid?\n", - "1. What is the purpose of this code in `LSTMCell`?: `h = torch.stack([h, input], dim=1)`\n", - "1. What does `chunk` to in PyTorch?\n", - "1. Study the refactored version of `LSTMCell` carefully to ensure you understand how and why it does the same thing as the non-refactored version.\n", - "1. Why can we use a higher learning rate for `LMModel6`?\n", - "1. What are the three regularisation techniques used in an AWD-LSTM model?\n", - "1. What is dropout?\n", - "1. Why do we scale the weights with dropout? Is this applied during training, inference, or both?\n", - "1. What is the purpose of this line from `Dropout`?: `if not self.training: return x`\n", - "1. Experiment with `bernoulli_` to understand how it works.\n", - "1. How do you set your model in training mode in PyTorch? In evaluation mode?\n", - "1. Write the equation for activation regularization (in maths or code, as you prefer). How is it different to weight decay?\n", - "1. Write the equation for temporal activation regularization (in maths or code, as you prefer). Why wouldn't we use this for computer vision problems?\n", - "1. What is \"weight tying\" in a language model?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Further research" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Write the code for an LSTM from scratch (but you may refer to <>).\n", - "1. 
Search on the Internet for the GRU architecture and implement it from scratch, and try training a model. See if you can get the similar results as we saw in this chapter. Compare it to the results of PyTorch's built in GRU module.\n", - "1. Have a look at the source code for AWD-LSTM in fastai, and try to map each of the lines of code to the concepts shown in this chapter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "split_at_heading": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": true, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/12_nlp_dive.ipynb b/12_nlp_dive.ipynb new file mode 100644 index 0000000..2fcb33e --- /dev/null +++ b/12_nlp_dive.ipynb @@ -0,0 +1,2350 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#hide\n", + "from utils import *" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "[[chapter_nlp_dive]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A language model from scratch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're now ready to go deep... deep into deep learning! 
You already learned how to train a basic neural network, but how do you go from there to creating state of the art models? In this part of the book we're going to uncover all of the mysteries, starting with language models.\n", + "\n", + "We saw in <> how to finetune a pretrained language model to build a text classifier. In this chapter, we will explain to you what exactly is inside that model, and what an RNN is. First, let's gather some data that will allow us to quickly prototype our various models. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Whenever we start working on a new problem, we always first try to think of the simplest dataset we can which would allow us to try out methods quickly and easily, and interpret the results. When we started working on language modelling a few years ago, we didn't find any datasets that would allow for quick prototyping, so we made one. We call it *human numbers*, and it simply contains the first 10,000 numbers written out in English." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> j: One of the most common practical mistakes I see even amongst highly experienced practitioners is failing to use appropriate datasets at appropriate times during the analysis process. In particular, most people tend to start with datasets which are too big and too complicated." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can download, extract, and take a look at our dataset in the usual way:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai2.text.all import *\n", + "path = untar_data(URLs.HUMAN_NUMBERS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#hide\n", + "Path.BASE_PATH = path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#2) [Path('train.txt'),Path('valid.txt')]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path.ls()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's open those two files and see what's inside. At first we'll join all of those texts together and ignore the split train/valid given by the dataset, we will come back to it later on:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#9998) ['one \\n','two \\n','three \\n','four \\n','five \\n','six \\n','seven \\n','eight \\n','nine \\n','ten \\n'...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lines = L()\n", + "with open(path/'train.txt') as f: lines += L(*f.readlines())\n", + "with open(path/'valid.txt') as f: lines += L(*f.readlines())\n", + "lines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We take all those lines and concatenate them in one big stream. To mark when we go from one number to the next, we use a '.' as separation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'one . two . three . four . five . six . seven . eight . 
nine . ten . eleven . twelve . thirteen . fo'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = ' . '.join([l.strip() for l in lines])\n", + "text[:100]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use word tokenization for this dataset, by splitting on spaces:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokens = text.split(' ')\n", + "tokens[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To numericalize, we have to create a list of all the unique tokens (our *vocab*):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab = L(*tokens).unique()\n", + "vocab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we can convert our tokens into numbers by looking up the index of each in the vocab:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#63095) [0,1,2,1,3,1,4,1,5,1...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word2idx = {w:i for i,w in enumerate(vocab)}\n", + "nums = L(word2idx[i] for i in tokens)\n", + "nums" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have some small dataset on which language modelling should be an easy task, we can build our first 
model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Our first language model from scratch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One simple way to turn this into a neural network would be to specify that we are going to predict each word based on the previous three words. Therefore, we could create a list of every sequence of three words as independent variables, and the next word after each sequence as the dependent variable. \n", + "\n", + "We can do that with plain Python. Let us do it first with tokens just to confirm what it looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will do it with tensors of the numericalized values, which is what the model will actually use:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10, 1, 11]), 1),(tensor([ 1, 12, 1]), 13),(tensor([13, 1, 14]), 1),(tensor([ 1, 15, 1]), 16)...]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in 
range(0,len(nums)-4,3))\n", + "seqs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we can batch those easily using the `DataLoader` class. For now we will simply use the first 80% of the sequences for training and the remaining 20% for validation, without shuffling:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bs = 64\n", + "cut = int(len(seqs) * 0.8)\n", + "dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now create a neural network architecture that takes three words as input, and returns a prediction of the probability of each possible next word in the vocab. We will use three standard linear layers, but with two tweaks.\n", + "\n", + "The first tweak is that the first linear layer will use only the first word's embedding as activations, the second layer will use the second word's embedding plus the first layer's output activations, and the third layer will use the third word's embedding plus the second layer's output activations. The key effect of this is that every word is interpreted in the information context of any words preceding it. \n", + "\n", + "The second tweak is that each of these three layers will use the same weight matrix. The way that one word impacts the activations from previous words should not change depending on the position of a word. In other words, activation values will change as data moves through the layers, but the layer weights themselves will not change from layer to layer. So a layer does not learn one sequence position; it must learn to handle all positions.\n", + "\n", + "Since layer weights do not change, you might think of the sequential layers as the \"same layer\" repeated. In fact PyTorch makes this concrete; we can just create one layer, and use it multiple times." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Our language model in PyTorch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now create the language model module that we described earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel1(Module):\n", + " def __init__(self, vocab_sz, n_hidden):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", + " self.h_h = nn.Linear(n_hidden, n_hidden) \n", + " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", + " \n", + " def forward(self, x):\n", + " h = F.relu(self.h_h(self.i_h(x[:,0])))\n", + " h = h + self.i_h(x[:,1])\n", + " h = F.relu(self.h_h(h))\n", + " h = h + self.i_h(x[:,2])\n", + " h = F.relu(self.h_h(h))\n", + " return self.h_o(h)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you see, we have created three layers:\n", + "\n", + "- The embedding layer (`i_h` for *input* to *hidden*)\n", + "- The linear layer to create the activations for the next word (`h_h` for *hidden* to *hidden*)\n", + "- A final linear layer to predict the fourth word (`h_o` for *hidden* to *output*)\n", + "\n", + "This might be easier to represent in pictorial form. Let's define a simple pictorial representation of basic neural networks. <> shows how we're going to represent a neural net with one hidden layer." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Pictorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each shape represents activations: rectangle for input, circle for hidden (inner) layer activations, and triangle for output activations. We will use those shapes (summarized in <>) in all the diagrams of this chapter." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Shapes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An arrow represents the actual layer computation—i.e. the linear layer followed by the activation layers. Using this notation, <> shows what our simple language model looks like." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Representation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To simplify things, we've removed the details of the layer computation from each arrow. We've also color-coded the arrows, such that all arrows with the same color have the same weight matrix. For instance, all the input layers use the same embedding matrix, so they all have the same color (green).\n", + "\n", + "Let's try training this model and see how it goes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
01.8242971.9709410.46755400:02
11.3869731.8232420.46755400:02
21.4175561.6544970.49441400:02
31.3764401.6508490.49441400:02
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)\n", + "learn.fit_one_cycle(4, 1e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To see if this is any good, let's check what would a very simple model give us. In this case we could always predict the most common token, so let's find out which token is the most often the target in our validation set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor(29), 'thousand', 0.15165200855716662)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n,counts = 0,torch.zeros(len(vocab))\n", + "for x,y in dls.valid:\n", + " n += y.shape[0]\n", + " for i in range_of(vocab): counts[i] += (y==i).long().sum()\n", + "idx = torch.argmax(counts)\n", + "idx, vocab[idx.item()], counts[idx].item()/n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The most common token has the index 29, which corresponds to the token 'thousand'. Always predicting this token would give us an accuracy of roughly 15\\%, so we are faring way better!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> A: My first guess was that the separator would be the most common token, since there is one for every number. But looking at `tokens` reminded me that large numbers are written with many words, so on the way to 10,000 you write \"thousand\" a lot: five thousand, five thousand and one, five thousand and two, etc.. Oops! Looking at your data is great for noticing subtle features and also embarrassingly obvious ones." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a nice first baseline. Let's see how we can refactor this with a loop." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Our first recurrent neural network" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the code for our module, we could simplify it by replacing the duplicated code that calls the layers with a for loop. As well as making our code simpler, this will also have the benefit that we could apply our module equally well to token sequences of different lengths; we would not be restricted to token lists of length three." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel2(Module):\n", + " def __init__(self, vocab_sz, n_hidden):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", + " self.h_h = nn.Linear(n_hidden, n_hidden) \n", + " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", + " \n", + " def forward(self, x):\n", + " h = 0\n", + " for i in range(3):\n", + " h = h + self.i_h(x[:,i])\n", + " h = F.relu(self.h_h(h))\n", + " return self.h_o(h)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that we get the same results using this refactoring:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
01.8162741.9641430.46018500:02
11.4238051.7399640.47325900:02
21.4303271.6851720.48538200:02
31.3883901.6570330.47040600:02
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)\n", + "learn.fit_one_cycle(4, 1e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also refactor our pictorial representation in exactly the same way, see <> (we're also removing the details of activation sizes here, and using the same arrow colors as in <>)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Basic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will see that there is a set of activations which are being updated each time through the loop, and are stored in the variable `h` — this is called the *hidden state*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Jargon: hidden state: the activations that are updated at each step of a recurrent neural network" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A neural network which is defined using a loop like this is called a *recurrent neural network*, also known as an RNN. It is important to realise that an RNN is not a complicated new architecture, but is simply a refactoring of a multilayer neural network using a for loop.\n", + "\n", + "> A: My true opinion: if they were called \"looping neural networks\", or LNNs, they would seem 50% less daunting!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we know what an RNN is, let's try to make it a little bit better." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Improving the RNN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the code for our RNN, one thing that seems problematic is that we are initialising our hidden state to zero for every new input sequence. Why is that a problem? 
We made our sample sequences short so they would fit easily into batches. But if we order those samples correctly, those sample sequences will be read in order by the model, exposing the model to long stretches of the original sequence. \n", + "\n", + "Another thing we can look at is having more signal: why only predict the fourth word when we could use the intermediate predictions to also predict the second and third words? \n", + "\n", + "We'll see how we can implement those changes, starting with adding some state." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Maintaining the state of an RNN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because we initialize the model's hidden state to zero for each new sample, we are throwing away all the information we have about the sentences we have seen so far, which means that our model doesn't actually know where we are up to in the overall counting sequence. This is easily fixed; we can simply move the initialisation of the hidden state to `__init__`.\n", + "\n", + "But this fix will create its own subtle, but important, problem. It effectively makes our neural network as deep as the entire number of tokens in our document. For instance, if there were 10,000 tokens in our dataset, we would be creating a 10,000 layer neural network.\n", + "\n", + "To see this, consider the original pictorial representation of our recurrent neural network in <>, before refactoring it with a for loop. You can see each layer corresponds with one token input. When we talk about the representation of a recurrent neural network before refactoring with the for loop, we call this the *unrolled representation*. 
It is often helpful to consider the unrolled representation when trying to understand an RNN.\n", + "\n", + "The problem with a 10,000 layer neural network is that if and when you get to the 10,000th word of the dataset, you will still need to calculate the derivatives all the way back to the first layer. This is going to be very slow indeed, and very memory intensive. It is unlikely that you could store even one mini batch on your GPU.\n", + "\n", + "The solution to this is to tell PyTorch that we do not want to back propagate the derivatives through the entire implicit neural network. Instead, we will just keep the last three layers of gradients. To remove all of the gradient history in PyTorch, we use the `detach` method.\n", + "\n", + "Here is the new version of our RNN. It is now stateful, because it remembers its activations between different calls to `forward`, which represent its use for different samples in the batch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel3(Module):\n", + " def __init__(self, vocab_sz, n_hidden):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", + " self.h_h = nn.Linear(n_hidden, n_hidden) \n", + " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", + " self.h = 0\n", + " \n", + " def forward(self, x):\n", + " for i in range(3):\n", + " self.h = self.h + self.i_h(x[:,i])\n", + " self.h = F.relu(self.h_h(self.h))\n", + " out = self.h_o(self.h)\n", + " self.h = self.h.detach()\n", + " return out\n", + " \n", + " def reset(self): self.h = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you think about it, this model will have the same activations whatever the sequence length we pick, because the hidden state will remember the last activation from the previous batch. 
The only thing that will be different are the gradients computed at each step: they will only be calculated on sequence length tokens in the past, instead of the whole stream. That is why this sequence length is often called *bptt* for back-propagation through time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* jargon: Back propagation through time (BPTT): Treating a neural net with effectively one layer per time step (usually refactored using a loop) as one big model, and calculating gradients on it in the usual way. To avoid running out of memory and time, we usually use _truncated_ BPTT, which \"detaches\" the history of computation steps in the hidden state every few time steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use `LMModel3`, we need to make sure the samples are going to be seen in a certain order. As we saw in <>, if the first line of the first batch is our `dset[0]` then the second batch should have `dset[1]` as the first line, so that the model sees the text flowing.\n", + "\n", + "`LMDataLoader` was doing this for us in <>. This time we're going to do it ourselves.\n", + "\n", + "To do this, we are going to rearrange our dataset. First we divide the samples into `m = len(dset) // bs` groups (this is the equivalent of splitting the whole concatenated dataset into, for instance, 64 equally sized pieces, since we're using `bs=64` here). `m` is the length of each of these pieces. 
For instance, if we're using our whole dataset (although we'll actually split it into train vs valid in a moment), that will be:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(328, 64, 21031)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m = len(seqs)//bs\n", + "m,bs,len(seqs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first batch will be composed of the samples:\n", + "\n", + " (0, m, 2*m, ..., (bs-1)*m)\n", + "\n", + "then the second batch of the samples: \n", + "\n", + " (1, m+1, 2*m+1, ..., (bs-1)*m+1)\n", + "\n", + "and so forth. This way, at each epoch, the model will see a chunk of contiguous text of size `3*m` (since each text is of size 3) on each line of the batch.\n", + "\n", + "The following function does that reindexing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def group_chunks(ds, bs):\n", + " m = len(ds) // bs\n", + " new_ds = L()\n", + " for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))\n", + " return new_ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we just pass `drop_last=True` when building our `DataLoaders` to drop the last batch that has not a shape of `bs`, we also pass `shuffle=False` to make sure the texts are read in order." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cut = int(len(seqs) * 0.8)\n", + "dls = DataLoaders.from_dsets(\n", + " group_chunks(seqs[:cut], bs), \n", + " group_chunks(seqs[cut:], bs), \n", + " bs=bs, drop_last=True, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last thing we add is a little tweak of the training loop via a `Callback`. 
We will talk more about callbacks in <>; this one will call the `reset` method of our model at the beginning of each epoch and before each validation phase. Since we implemented that method to zero the hidden state of the model, this will make sure we start we a clean state before reading those continuous chunks of text. We can also start training a bit longer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
01.6770741.8273670.46754800:02
11.2827221.8709130.38894200:02
21.0907051.6517930.46250000:02
31.0050921.6137940.51658700:02
40.9659751.5607750.55120200:02
50.9161821.5958570.56057700:02
60.8976571.5397330.57427900:02
70.8362741.5851410.58317300:02
80.8058771.6298080.58677900:02
90.7950961.6512670.58894200:02
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,\n", + " metrics=accuracy, cbs=ModelReseter)\n", + "learn.fit_one_cycle(10, 3e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is already better! The next step is to use more targets and compare them to the intermediate predictions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating more signal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another problem with our current approach is that we only predict one output word for each three input words. That means that the amount of signal that we are feeding back to update weights with is not as large as it could be. It would be better if we predicted the next word after every single word, rather than every three words, as shown in <>." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"RNN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is easy enough to add. We need to first change our data so that the dependent variable has each of the three next words after each of our three input words. Instead of 3, we use an attribute, `sl` (for sequence length) and make it a bit bigger:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sl = 16\n", + "seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))\n", + " for i in range(0,len(nums)-sl-1,sl))\n", + "cut = int(len(seqs) * 0.8)\n", + "dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),\n", + " group_chunks(seqs[cut:], bs),\n", + " bs=bs, drop_last=True, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the first element of `seqs`, we can see that it contains two lists of the same size. 
The second list is the same as the first, but offset by one element:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(#16) ['one','.','two','.','three','.','four','.','five','.'...],\n", + " (#16) ['.','two','.','three','.','four','.','five','.','six'...]]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[L(vocab[o] for o in s) for s in seqs[0]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need to modify our model so that it outputs a prediction after every word, rather than just at the end of a three word sequence:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel4(Module):\n", + " def __init__(self, vocab_sz, n_hidden):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden) \n", + " self.h_h = nn.Linear(n_hidden, n_hidden) \n", + " self.h_o = nn.Linear(n_hidden,vocab_sz)\n", + " self.h = 0\n", + " \n", + " def forward(self, x):\n", + " outs = []\n", + " for i in range(sl):\n", + " self.h = self.h + self.i_h(x[:,i])\n", + " self.h = F.relu(self.h_h(self.h))\n", + " outs.append(self.h_o(self.h))\n", + " self.h = self.h.detach()\n", + " return torch.stack(outs, dim=1)\n", + " \n", + " def reset(self): self.h = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This model will return outputs of shape `bs x sl x vocab_sz` (since we stacked on `dim=1`). 
Our targets are of shape `bs x sl`, so we need to flatten those before using them in `F.cross_entropy`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def loss_func(inp, targ):\n", + " return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now use this loss function to train the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
03.1032982.8743410.21256500:01
12.2319641.9712800.46215800:01
21.7113581.8135470.46118200:01
31.4485161.8281760.48323600:01
41.2886301.6595640.52067100:01
51.1614701.7140230.55493200:01
61.0555681.6609160.57503300:01
70.9607651.7196240.59106400:01
80.8701531.8395600.61466500:01
90.8085451.7702780.62434900:01
100.7580841.8429310.61075800:01
110.7193201.7995270.64656600:01
120.6834391.9179280.64982100:01
130.6602831.8747120.62858100:01
140.6461541.8775190.64005500:01
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,\n", + " metrics=accuracy, cbs=ModelReseter)\n", + "learn.fit_one_cycle(15, 3e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to train for longer, since the task has changed a bit and is more complicated now. But we end up with a good result... At least, sometimes. If you run it a few times, you'll see that you can get quite different results on different runs. That's because effectively we have a very deep network here, which can result in very large or very small gradients. We'll see in the next part of this chapter how to deal with this.\n", + "\n", + "Now, the obvious way to get a better model is to go deeper: we only have one linear layer between the hidden state and the output activations in our basic RNN, so maybe we would get better results with more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multilayer RNNs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a multilayer RNN, we pass the activations from our recurrent neural network into a second recurrent neural network, like in <>." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"2-layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "…or in an unrolled representation in <> (the same way as in <> last section)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"2-layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how to implement this in practice." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's save some time by using PyTorch's RNN class, which implements exactly what we have created above, but also gives us the option to stack multiple RNNs, as we have discussed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel5(Module):\n", + " def __init__(self, vocab_sz, n_hidden, n_layers):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", + " self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)\n", + " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", + " self.h = torch.zeros(n_layers, bs, n_hidden)\n", + " \n", + " def forward(self, x):\n", + " res,h = self.rnn(self.i_h(x), self.h)\n", + " self.h = h.detach()\n", + " return self.h_o(res)\n", + " \n", + " def reset(self): self.h.zero_()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
03.0558532.5916400.43790700:01
12.1623591.7873100.47159800:01
21.7106631.9418070.32177700:01
31.5207831.9997260.31201200:01
41.3308462.0129020.41324900:01
51.1632971.8961920.45068400:01
61.0338132.0052090.43481400:01
70.9190902.0470830.45670600:01
80.8229392.0680310.46883100:01
90.7501802.1360640.47509800:01
100.6951202.1391400.48543300:01
110.6557522.1550810.49365200:01
120.6296502.1625830.49853500:01
130.6135832.1716490.49104800:01
140.6043092.1803550.48787400:01
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), \n", + " metrics=accuracy, cbs=ModelReseter)\n", + "learn.fit_one_cycle(15, 3e-3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that's disappointing... we are doing more poorly than the single-layer RNN from the end of last section. The reason is that we have a deeper model, leading to exploding or disappearing activations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploding or disappearing activations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In practice, creating accurate models from this kind of RNN is difficult. We will get better results if we call `detach` less often, and have more layers — this gives our RNN a longer time horizon to learn from, and richer features to create. But it also means we have a deeper model to train. The key challenge in the development of deep learning has been figuring out how to train these kinds of models.\n", + "\n", + "The reason this is challenging is because of what happens when you multiply by a matrix many times. Think about what happens when you multiply by a number many times. For example, if you multiply by two, starting at one, you get the sequence 1, 2, 4, 8,… after 32 steps you are already at 4,294,967,296. A similar issue happens if we multiply by 0.5: we get 0.5, 0.25, 0.125… and after 32 steps it's 0.00000000023. As you can see, a number even slightly higher or lower than one results in an explosion or disappearance of our number, after just a few repeated multiplications.\n", + "\n", + "Because matrix multiplication is just multiplying numbers and adding them up, exactly the same thing happens with repeated matrix multiplications. 
And a deep neural network is just repeated matrix multiplications--each extra layer is another matrix multiplication. This means that it is very easy for a deep neural network to end up with extremely large, or extremely small numbers.\n", + "\n", + "This is a problem, because the way computers store numbers (known as \"floating point\") means that they become less and less accurate the further away the numbers get from zero. The diagram in <>, from the excellent article [What you never wanted to know about floating point but will be forced to find out](http://www.volkerschatz.com/science/float.html), shows how the precision of floating point numbers varies over the number line:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Precision" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This inaccuracy means that often the gradients calculated for updating the weights end up as zero or infinity for deep networks. This is commonly referred to as *vanishing gradients* or *exploding gradients*. That means that in SGD, the weights are updated either not at all, or jump to infinity. Either way, they won't improve with training.\n", + "\n", + "Researchers have developed a number of ways to tackle this problem, which we will be discussing later in the book. One way to tackle the problem is to change the definition of a layer in a way that makes it less likely to have exploding activations. We'll look at the details of how this is done in <>, when we discuss *batch normalization*, and <>, when we discuss *ResNets*, although these details don't generally matter in practice (unless you are a researcher that is creating new approaches to solving this problem). 
Another way to deal with this is by being careful about *initialization*, which is a topic we'll investigate in <>.\n", + "\n", + "For RNNs, there are two types of layers frequently used to avoid exploding activations, and they are: *gated recurrent units* (GRU), and *Long Short-Term Memory* (LSTM). Both of these are available in PyTorch, and are drop-in replacements for the RNN layer. We will only cover LSTMs in this book; there are plenty of good tutorials online explaining GRUs, which are a minor variant on the LSTM design." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LSTM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "LSTM (for long short-term memory) is an architecture that was introduced back in 1997 by Jurgen Schmidhuber and Sepp Hochreiter. In this architecture, there are not one but two hidden states. In our base RNN, the hidden state is the output of the RNN at the previous time step. That hidden state is then responsible for doing two things at a time:\n", + "\n", + "- having the right information for the output layer to predict the correct next token\n", + "- retaining memory of everything that happened in the sentence\n", + "\n", + "Consider, for example, the sentences \"Henry has a dog and he likes his dog very much\" and \"Sophie has a dog and she likes her dog very much\". It's very clear that the RNN needs to remember the name at the beginning of the sentence to be able to predict *he/she* or *his/her*. \n", + "\n", + "In practice, RNNs are really bad at retaining memory of what happened much earlier in the sentence, which is the motivation to have another hidden state (called cell state) in the LSTM. The cell state will be responsible for keeping *long short-term memory*, while the hidden state will focus on the next token to predict. Let's have a closer look at how this is achieved and build one LSTM from scratch." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building an LSTM from scratch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to build an LSTM, we first have to understand its architecture. <> shows us its inner structure.\n", + " \n", + "\"A" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this picture, our input $x_{t}$ enters on the bottom with the previous hidden state ($h_{t-1}$) and cell state ($c_{t-1}$). The four orange boxes represent four layers with the activation being either sigmoid (for $\\sigma$) or tanh. tanh is just a sigmoid rescaled to the range -1 to 1. Its mathematical expression can be written like this:\n", + "\n", + "$$\\tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x}+e^{-x}} = 2 \\sigma(2x) - 1$$\n", + "\n", + "where $\\sigma$ is the sigmoid function. The green boxes are elementwise operations. What goes out is the new hidden state ($h_{t}$) and new cell state ($c_{t}$) on the left, ready for our next input. The new hidden state is also used as output, which is why the arrow splits to go up.\n", + "\n", + "Let's go over the four neural nets (called *gates*) one by one and explain the diagram, but before this, notice how very little the cell state (on the top) is changed. It doesn't even go directly through a neural net! This is exactly why it will carry on a longer-term state.\n", + "\n", + "First, the arrows for input and old hidden state are joined together. In the RNN we wrote before in this chapter, we were adding them together. In the LSTM, we stack them in one big tensor. This means the dimension of our embeddings (which is the dimension of $x_{t}$) can be different than the dimension of our hidden state. 
If we call those `n_in` and `n_hid`, the arrow at the bottom is of size `n_in + n_hid`, thus all the neural nets (orange boxes) are linear layers with `n_in + n_hid` inputs and `n_hid` outputs.\n", + "\n", + "The first gate (looking from the left to right) is called the *forget gate*. Since it's a linear layer followed by a sigmoid, its output will have scalars between 0 and 1. We multiply this result by the cell state, so for all the values close to 0, we will forget what was inside that cell state (and for the values close to 1 it doesn't do anything). This gives the ability to the LSTM to forget things about its long-term state. For instance, when crossing a period or an `xxbos` token, we would expect it to (have learned to) reset its cell state.\n", + "\n", + "The second gate is called the *input gate*. It works with the third gate (which doesn't really have a name but is sometimes called the *cell gate*) to update the cell state. For instance we may see a new gender pronoun, so we must replace the information about gender that the forget gate removed by the new one. Like the forget gate, the input gate ends up on a product, so it just decides which element of the cell state to update (values close to 1) or not (values close to 0). The third gate will then fill those values with things between -1 and 1 (thanks to the tanh). The result is then added to the cell state.\n", + "\n", + "The last gate is the *output gate*. It decides which information to take from the cell state to generate the output. 
The cell state goes through a tanh before this and the output gate combined with the sigmoid decides which values to take inside it.\n", + "\n", + "\n", + "In terms of code, we can write the same steps like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LSTMCell(Module):\n", + " def __init__(self, ni, nh):\n", + " self.forget_gate = nn.Linear(ni + nh, nh)\n", + " self.input_gate = nn.Linear(ni + nh, nh)\n", + " self.cell_gate = nn.Linear(ni + nh, nh)\n", + " self.output_gate = nn.Linear(ni + nh, nh)\n", + "\n", + " def forward(self, input, state):\n", + " h,c = state\n", + " h = torch.stack([h, input], dim=1)\n", + " forget = torch.sigmoid(self.forget_gate(h))\n", + " c = c * forget\n", + " inp = torch.sigmoid(self.input_gate(h))\n", + " cell = torch.tanh(self.cell_gate(h))\n", + " c = c + inp * cell\n", + " out = torch.sigmoid(self.output_gate(h))\n", + " h = outgate * torch.tanh(c)\n", + " return h, (h,c)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In practice, we can then refactor the code. Also, in terms of performance, it's better to do one big matrix multiplication than four smaller ones (that's because we only launch the special fast kernel on GPU once, and it gives the GPU more work to do in parallel). The stacking takes a bit of time (since we have to move one of the tensors around on the GPU to have it all in a contiguous array), so we use two separate layers for the input and the hidden state. 
The optimized and refactored code then looks like that:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LSTMCell(Module):\n", + " def __init__(self, ni, nh):\n", + " self.ih = nn.Linear(ni,4*nh)\n", + " self.hh = nn.Linear(nh,4*nh)\n", + "\n", + " def forward(self, input, state):\n", + " h,c = state\n", + " #One big multiplication for all the gates is better than 4 smaller ones\n", + " gates = (self.ih(input) + self.hh(h)).chunk(4, 1)\n", + " ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])\n", + " cellgate = gates[3].tanh()\n", + "\n", + " c = (forgetgate*c) + (ingate*cellgate)\n", + " h = outgate * c.tanh()\n", + " return h, (h,c)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we use the PyTorch `chunk` method to split our tensor into 4 pieces, e.g.:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t = torch.arange(0,10); t" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.chunk(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now use this architecture to train a language model!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training a language model using LSTMs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the same network as `LMModel5`, using a two-layer LSTM. 
We can train it at a higher learning rate, for a shorter time, and get better accuracy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel6(Module):\n", + " def __init__(self, vocab_sz, n_hidden, n_layers):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", + " self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)\n", + " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", + " self.h = [torch.zeros(2, bs, n_hidden) for _ in range(n_layers)]\n", + " \n", + " def forward(self, x):\n", + " res,h = self.rnn(self.i_h(x), self.h)\n", + " self.h = [h_.detach() for h_ in h]\n", + " return self.h_o(res)\n", + " \n", + " def reset(self): \n", + " for h in self.h: h.zero_()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
03.0008212.6639420.43831400:02
12.1396422.1847800.24047900:02
21.6072751.8126820.43977900:02
31.3477111.8309820.49747700:02
41.1231131.9377660.59440100:02
50.8520422.0121270.63159200:02
60.5654941.3127420.72574900:02
70.3474451.2979340.71126300:02
80.2081911.4412690.73120100:02
90.1263351.5699520.73730500:02
100.0797611.4271870.75415000:02
110.0529901.4949900.74511700:02
120.0390081.3937310.75789400:02
130.0315021.3732100.75846400:02
140.0280681.3680830.75846400:02
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn = Learner(dls, LMModel6(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), \n", + " metrics=accuracy, cbs=ModelReseter)\n", + "learn.fit_one_cycle(15, 1e-2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that's better than a multilayer RNN! We can still see there is a bit of overfitting, which is a sign that a bit of regularization might help." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Regularizing an LSTM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recurrent neural networks, in general, are hard to train, because of the problems of vanishing activations and gradients we saw before. Using LSTM (or GRU) cells makes training easier than with vanilla RNNs, but they are still very prone to overfitting. Data augmentation, while it exists for text data, is less often used because in most cases, it requires another model to generate random augmentation (by translating the text into another language and then back into the original language, for instance). Overall, data augmentation for text data is currently not a well-explored space.\n", + "\n", + "However, there are other regularization techniques we can use instead to reduce overfitting, which were thoroughly studied for use with LSTMs in the paper [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182). This paper showed how effective use of *dropout*, *activation regularization*, and *temporal activation regularization* could allow an LSTM to beat state-of-the-art results that previously required much more complicated models. They called an LSTM using these techniques an *AWD LSTM*. We'll look at each of these techniques in turn." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dropout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dropout is a regularization technique that was introduced by Geoffrey Hinton et al. in [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). The basic idea is to randomly change some activations to zero at training time. This makes sure all neurons actively work toward the output as seen in <> which is a screenshot from the original paper.\n", + "\n", + "\"A\n", + "\n", + "Hinton used a nice metaphor when he explained, in an interview, the inspiration for dropout:\n", + "\n", + "> : \"I went to my bank. The tellers kept changing and I asked one of them why. He said he didn’t know but they got moved around a lot. I figured it must be because it would require cooperation between employees to successfully defraud the bank. This made me realize that randomly removing a different subset of neurons on each example would prevent conspiracies and thus reduce overfitting\"\n", + "\n", + "In the same interview, he also explained that neuroscience provided additional inspiration:\n", + "\n", + "> : \"We don't really know why neurons spike. One theory is that they want to be noisy so as to regularize, because we have many more parameters than we have data points. The idea of dropout is that if you have noisy activations, you can afford to use a much bigger model.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TK add takeaway from those citations before moving on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see there that if we just zero those activations without doing anything else, our model will have problems training: if we go from the sum of 5 activations (that are all positive numbers since we apply a ReLU) to just 2, this won't have the same scale. 
Therefore if we dropout with a probability `p`, we rescale all activation by dividing them by `1-p` (on average `p` will be zeroed, so it leaves `1-p`), as shown in <> which is a diagram from the original paper.\n", + "\n", + "\"A\n", + "\n", + "This is a full implementation of the dropout layer in PyTorch (although PyTorch's native layer is actually written in C, not Python):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Dropout(Module):\n", + " def __init__(self, p): self.p = p\n", + " def forward(self, x):\n", + " if not self.training: return x\n", + " mask = x.new(*x.shape).bernoulli_(1-p)\n", + " return x * mask.div_(1-p)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `bernoulli_` method is creating a tensor with random zeros (with probability p) and ones (with probability 1-p), which is then multiplied with our input before dividing by `1-p`. Note the use of the `training` attribute, which is available in any PyTorch `nn.Module`, and tells us if we are doing training or inference.\n", + "\n", + "> note: In previous chapters of the book we'd be adding a code example for `bernoulli_` here, so you can see exactly how it works. But now that you know enough to do this yourself, we're going to be doing fewer and fewer examples for you, and instead expecting you to do your own experiments to see how things work. In this case, you'll see in the end-of-chapter questionnaire that we're asking you to experiment with `bernoulli_`--but don't wait for us to ask you to experiment to develop your understanding of the code we're studying, go ahead and do it anyway!\n", + "\n", + "Using dropout before passing the output of our LSTM to the final layer will help reduce overfitting. 
Dropout is also used in many other models, including the default CNN head used in `fastai.vision`, and is also available in `fastai.tabular` by passing the `ps` parameter (where each \"p\" is passed to each added `Dropout` layer), as we'll see in <>." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dropout has a different behavior in training and validation mode, which we achieved using the `training` attribute in `Dropout` above. Calling the `train()` method on a `Module` sets `training` to `True` (both for the module you call the method on, and for every module it recursively contains), and `eval()` sets it to `False`. This is done automatically when calling the methods of `Learner`, but if you are not using that class, remember to switch from one to the other as needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AR and TAR regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "AR (for *activation regularization*) and TAR (for *temporal activation regularization*) are two regularization methods very similar to weight decay. When applying weight decay, we add a small penalty to the loss that aims at making the weights as small as possible. For the activation regularization, it's the final activations produced by the LSTM that we will try to make as small as possible, instead of the weights.\n", + "\n", + "To regularize the final activations, we have to store those somewhere, then add the means of the squares of them to the loss (along with a multiplier `alpha`, which is just like `wd` for weight decay):\n", + "\n", + "``` python\n", + "loss += alpha * activations.pow(2).mean()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Temporal activation regularization is linked to the fact we are predicting tokens in a sentence. That means it's likely that the outputs of our LSTMs should somewhat make sense when we read them in order. 
TAR is there to encourage that behavior by adding a penalty to the loss to make the difference between two consecutive activations as small as possible: our activations tensor has a shape `bs x sl x n_hid`, and we read consecutive activations on the sequence length axis (so the dimension in the middle). With this, TAR can be expressed as:\n", + "\n", + "``` python\n", + "loss += beta * (activations[:,1:] - activations[:,:-1]).pow(2).mean()\n", + "```\n", + "\n", + "`alpha` and `beta` are then two hyper-parameters to tune. To make this work, we need our model with dropout to return three things: the proper output, the activations of the LSTM pre-dropout and the activations of the LSTM post-dropout. AR is often applied on the dropped out activations (to not penalize the activations we turned into zeros afterward) while TAR is applied on the non-dropped out activations (because those zeros create big differences between two consecutive timesteps). There is then a callback called `RNNRegularizer` that will apply this regularization for us." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training a weight-tied regularized LSTM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can combine dropout (applied before we go into our output layer) with the AR and TAR regularization to train our previous LSTM. We just need to return three things instead of one: the normal output of our LSTM, the dropped-out activations and the activations from our LSTMs. Those last two will be picked up by the callback `RNNRegularizer` for the contributions it has to make to the loss.\n", + "\n", + "Another useful trick we can add from the AWD LSTM paper is *weight tying*. In a language model, the input embeddings represent a mapping from English words to activations, and the output hidden layer represents a mapping from activations to English words. We might expect, intuitively, that these mappings could be the same. 
We can represent this in PyTorch by assigning the same weight matrix to each of these layers:\n", + "\n", + " self.h_o.weight = self.i_h.weight\n", + "\n", + "In `LMModel7`, we include these final tweaks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class LMModel7(Module):\n", + " def __init__(self, vocab_sz, n_hidden, n_layers, p):\n", + " self.i_h = nn.Embedding(vocab_sz, n_hidden)\n", + " self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)\n", + " self.drop = nn.Dropout(p)\n", + " self.h_o = nn.Linear(n_hidden, vocab_sz)\n", + " self.h_o.weight = self.i_h.weight\n", + " self.h = [torch.zeros(2, bs, n_hidden) for _ in range(n_layers)]\n", + " \n", + " def forward(self, x):\n", + " raw,h = self.rnn(self.i_h(x), self.h)\n", + " out = self.drop(raw)\n", + " self.h = [h_.detach() for h_ in h]\n", + " return self.h_o(out),raw,out\n", + " \n", + " def reset(self): \n", + " for h in self.h: h.zero_()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can create a regularized `Learner` using the `RNNRegularizer` callback:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),\n", + " loss_func=CrossEntropyLossFlat(), metrics=accuracy,\n", + " cbs=[ModelReseter, RNNRegularizer(alpha=2, beta=1)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A `TextLearner` automatically adds those two callbacks for us (with the default values for `alpha` and `beta` as above) so we can simplify the line above to:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),\n", + " loss_func=CrossEntropyLossFlat(), metrics=accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then train the 
model, and add additional regularization by increasing the weight decay to `0.1`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracytime
02.6938852.0134840.46663400:02
11.6855491.1873100.62931300:02
20.9733070.7913980.74560500:02
30.5558230.6404120.79410800:02
40.3518020.5572470.83610000:02
50.2449860.5949770.80729200:02
60.1922310.5116900.84676100:02
70.1624560.5203700.85807300:02
80.1426640.5259180.84228500:02
90.1284930.4950290.85807300:02
100.1175890.4642360.86718800:02
110.1098080.4665500.86930300:02
120.1042160.4551510.87182600:02
130.1002710.4526590.87361700:02
140.0981210.4583720.86938500:02
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn.fit_one_cycle(15, 1e-2, wd=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now this is far better than our previous model!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You have now seen everything that is inside the AWD-LSTM architecture we used in text classification in <>. It uses dropout in a lot more places:\n", + "\n", + "- embedding dropout (just after the embedding layer)\n", + "- input dropout (after the embedding layer)\n", + "- weight dropout (applied to the weights of the LSTM at each training step)\n", + "- hidden dropout (applied to the hidden state between two layers)\n", + "\n", + "which makes it even more regularized. Since fine-tuning those five dropout values (including the dropout before the output layer) is complicated, we have determined good defaults, and allow the magnitude of dropout to be tuned overall with the `drop_mult` parameter you saw (which is multiplied by each dropout).\n", + "\n", + "Another architecture that is very powerful, especially in \"sequence to sequence\" problems (that is, problems where the dependent variable is itself a variable-length sequence, such as language translation), is the Transformers architecture. You can find it in an online bonus chapter on the book website." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Questionnaire" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. If the dataset for your project is so big and complicated that working with it takes a significant amount of time, what should you do?\n", + "1. Why do we concatenate the documents in our dataset before creating a language model?\n", + "1. 
To use a standard fully connected network to predict the fourth word given the previous three words, what two tweaks do we need to make?\n", + "1. How can we share a weight matrix across multiple layers in PyTorch?\n", + "1. Write a module which predicts the third word given the previous two words of a sentence, without peeking.\n", + "1. What is a recurrent neural network?\n", + "1. What is hidden state?\n", + "1. What is the equivalent of hidden state in ` LMModel1`?\n", + "1. To maintain the state in an RNN why is it important to pass the text to the model in order?\n", + "1. What is an unrolled representation of an RNN?\n", + "1. Why can maintaining the hidden state in an RNN lead to memory and performance problems? How do we fix this problem?\n", + "1. What is BPTT?\n", + "1. Write code to print out the first few batches of the validation set, including converting the token IDs back into English strings, as we showed for batches of IMDb data in <>.\n", + "1. What does the `ModelReseter` callback do? Why do we need it?\n", + "1. What are the downsides of predicting just one output word for each three input words?\n", + "1. Why do we need a custom loss function for `LMModel4`?\n", + "1. Why is the training of `LMModel4` unstable?\n", + "1. In the unrolled representation, we can see that a recurrent neural network actually has many layers. So why do we need to stack RNNs to get better results?\n", + "1. Draw a representation of a stacked (multilayer) RNN.\n", + "1. Why should we get better results in an RNN if we call `detach` less often? Why might this not happen in practice with a simple RNN?\n", + "1. Why can a deep network result in very large or very small activations? Why does this matter?\n", + "1. In a computer's floating point representation of numbers, which numbers are the most precise?\n", + "1. Why do vanishing gradients prevent training?\n", + "1. Why does it help to have two hidden states in the LSTM architecture? 
What is the purpose of each one?\n", + "1. What are these two states called in an LSTM?\n", + "1. What is tanh, and how is it related to sigmoid?\n", + "1. What is the purpose of this code in `LSTMCell`?: `h = torch.stack([h, input], dim=1)`\n", + "1. What does `chunk` do in PyTorch?\n", + "1. Study the refactored version of `LSTMCell` carefully to ensure you understand how and why it does the same thing as the non-refactored version.\n", + "1. Why can we use a higher learning rate for `LMModel6`?\n", + "1. What are the three regularization techniques used in an AWD-LSTM model?\n", + "1. What is dropout?\n", + "1. Why do we scale the activations with dropout? Is this applied during training, inference, or both?\n", + "1. What is the purpose of this line from `Dropout`?: `if not self.training: return x`\n", + "1. Experiment with `bernoulli_` to understand how it works.\n", + "1. How do you set your model in training mode in PyTorch? In evaluation mode?\n", + "1. Write the equation for activation regularization (in maths or code, as you prefer). How is it different to weight decay?\n", + "1. Write the equation for temporal activation regularization (in maths or code, as you prefer). Why wouldn't we use this for computer vision problems?\n", + "1. What is \"weight tying\" in a language model?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Further research" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. In `LMModel2` why can `forward` start with `h=0`? Why don't we need to say `h=torch.zeros(…)`?\n", + "1. Write the code for an LSTM from scratch (but you may refer to <>).\n", + "1. Search on the Internet for the GRU architecture and implement it from scratch, and try training a model. See if you can get results similar to those we saw in this chapter. Compare it to the results of PyTorch's built-in GRU module.\n", + "1. 
Have a look at the source code for AWD-LSTM in fastai, and try to map each of the lines of code to the concepts shown in this chapter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "split_at_heading": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/14_deep_conv.ipynb b/14_deep_conv.ipynb deleted file mode 100644 index 87e1134..0000000 --- a/14_deep_conv.ipynb +++ /dev/null @@ -1,1044 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "from utils import *" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "[[chapter_deep_conv]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Improving training stability" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we are so good at recognizing threes from sevens, let's move onto something harder—recognizing all 10 digits. 
That means we'll need to use `MNIST` instead of `MNIST_SAMPLE`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "path = untar_data(URLs.MNIST)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "Path.BASE_PATH = path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(#2) [Path('testing'),Path('training')]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path.ls()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The data is in two folders named `training` and `testing`, so we have to tell `GrandparentSplitter` about that (it defaults to `train` and `valid`). We define a function `get_dls` to make it easy to change our batch size later:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_dls(bs=64):\n", - " return DataBlock(\n", - " blocks=(ImageBlock(cls=PILImageBW), CategoryBlock), \n", - " get_items=get_image_files, \n", - " splitter=GrandparentSplitter('training','testing'),\n", - " get_y=parent_label,\n", - " batch_tfms=Normalize()\n", - " ).dataloaders(path, bs=bs)\n", - "\n", - "dls = get_dls()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Always a good idea to look at your data before you use it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "dls.show_batch(max_n=9, figsize=(4,4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have our data ready, we can train a simple model on it." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## A simple baseline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the previous chapter, we built a model based on a `conv` function like this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def conv(ni, nf, ks=3, act=True):\n", - " res = nn.Conv2d(ni, nf, stride=2, kernel_size=ks, padding=ks//2)\n", - " if act: res = nn.Sequential(res, nn.ReLU())\n", - " return res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start with a basic CNN as a baseline. We'll use the same as we had in the last chapter, but with one tweak: we'll use more activations.\n", - "\n", - "As we discussed, we generally want to double the number of filters each time we have a stride 2 layer. So, one way to increase the number of filters throughout our network is to double the number of activations in the first layer – then every layer after that will end up twice as big as the previous version as well.\n", - "\n", - "But there is a subtle problem with this. Consider the kernel which is being applied to each pixel. By default, we use a 3x3 pixel kernel. That means that there are a total of 3×3 = 9 pixels that the kernel is being applied to at each location. Previously, our first layer had four filters output. That meant that there were four values being computed from nine pixels at each location. Think about what happens if we double this output to 8 filters. Then when we apply our kernel we would be using nine pixels to calculate eight numbers. 
That means that it isn't really learning much at all — the output size is almost the same as the input size. Neural networks will only create useful features if they're forced to do so—that is, that the number of outputs from an operation is smaller than the number of inputs.\n", - "\n", - "To fix this, we can use a larger kernel in the first layer. If we use a kernel of 5x5 pixels then there are 25 pixels being used at each kernel application — creating eight filters from this will mean the neural net will have to find some useful features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def simple_cnn():\n", - " return sequential(\n", - " conv(1 ,8, ks=5), #14x14\n", - " conv(8 ,16), #7x7\n", - " conv(16,32), #4x4\n", - " conv(32,64), #2x2\n", - " conv(64,10, act=False), #1x1\n", - " Flatten(),\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you'll see in a moment, we're going to look inside our models while they're training in order to try to find ways to make them train better. To do this, we use the `ActivationStats` callback, which records the mean, standard deviation, and histogram of activations of every trainable layer (as we've seen, callbacks are used to add behavior to the training loop; we'll see how they work in <>)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from fastai2.callback.hook import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to train quickly, so that means training at a high learning rate. 
Let's see how we go at 0.06:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def fit(epochs=1):\n", - " learn = Learner(dls, simple_cnn(), loss_func=F.cross_entropy,\n", - " metrics=accuracy, cbs=ActivationStats(with_hist=True))\n", - " learn.fit(epochs, 0.06)\n", - " return learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
02.3070712.3058650.11350000:16
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This didn't train at all well! Let's find out why.\n", - "\n", - "One handy feature of the callbacks passed to `Learner` is that they are made available automatically, with the same name as the callback class, except in `snake_case`. So our `ActivationStats` callback can be accessed through `activation_stats`. In fact--I'm sure you remember `learn.recorder`... can you guess how that is implemented? That's right, it's a callback called `Recorder`!\n", - "\n", - "`ActivationStats` includes some handy utilities for plotting the activations during training. `plot_layer_stats(idx)` plots the mean and standard deviation of the activations of layer number `idx`, along with the percent of activations near zero. Here's the first layer's plot:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.plot_layer_stats(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generally our model should have a consistent, or at least smooth, mean and standard deviation of layer activations during training. Activations near zero are particularly problematic, because it means we have computation in the model that's doing nothing at all (since multiplying by zero gives zero). When you have some zeros in one layer, they will therefore generally carry over to the next layer... which will then create more zeros. Here's the penultimate layer of our network:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAADWCAYAAAAuNG/NAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de5gcVZ3/8fdnLplJMgmQMAQFQwQTlShhddSVGBfvsKsPuKy/VbKAuhgXFsH1yrqiiK6iq+yqD4Kw3OSqKyCweEEFlODvJwY1YBQCAlHuE0JCZshtZr6/P6o6KZqemZ5MzXR39ef1PPVk+tSp6m/3zEl/+9Q5pxQRmJmZmZnZ6FpqHYCZmZmZWaNw8mxmZmZmViUnz2ZmZmZmVXLybGZmZmZWJSfPZmZmZmZVcvJsZmZmZlYlJ89mZk1O0sGSQtLetY7FzKzeOXk2MysgSXunCfHBtY7FrB5IeqekByStl3SBpPbMvlZJv5T097WM0RqDk2czMzMrNEmzgQuAfwOWAAcByzJVPgQ8HBHfrkF4VZM0pYjP1WicPDcJSTdLOk/S5yQ9nn7z/ndJLZI+JekxSb2S/j1zTJukUyXdL2mzpFWS3l923pMk/VZSn6RHJV0h6TmZ/aXLwW+S9HNJT0v6vaS3TObrNysqSa+RdKukjem2Mm1ff06r3JS2wQcyx3xA0oNpe/wRMLcWsZtNon2BDRFxaUTcCXwP2B9A0nzgJOC4ak6Ufi7eK+kwSXdJ6pd0k6T9yuq9XNIN6edjr6SrJO2T2f/8tOzhtC3eKemosnOUPrs/K+kR4KFhYro5befl27szdT6QxrtZ0j2S/k1SW2b/A2mO8A1JTwC3puXPST/b10valD5XTzXvVVE5eW4ufwe0A68h+Zb9CeB/gS6Sb+IfAT4h6dC0/n8Dfwu8H3gxcBrwRUn/WHbejwAvBd5O8iF8RYXn/jLweWARsAL4tqRdc3tlZk1IUitwLfBL4GXpdirwdPozwBHAc4BXpMccBvwncAZwIPAd4D8mM26zGrgXmCapR9J04K+A30gScB7wiYh4dAznew5Jsr2UpBd7V+D80k5J+wM/A/4v0AO8HhgEfiypM63WBfwUOITkM/Qc4AJJryt7r
v8DdANvSM9Tyd+mMZW2zwH9wK/SeE4l+az+V5LP85NIPts/XXaeE4HHgVcDx6Tvz/eAFwFvBV4JPJa+jt1HfIeKLCK8NcEG3Az8tqxsFXBnWdlKkkT3+cAQ8KKy/Z8qP0/Z/r8AAtgrfXxw+vhvM3X2TMveUuv3xZu3Rt6A3dK2dHCFfXtX2gcsBy4tK/tyWnfvWr8mb94magPeln7G3UfyBbIV+ABwffq59L1037eArhHOcyowAHRnyt6ZfmZ2po8vBK4oO66D5Ivt4SOc+xrg3Mzjm4HVQMsYXudbgC3A29LH09LnPaSs3tHA+szjB4CfltV5Q/p/w/5lr+MR4FO1/p3WatveXW9NYWXZ40fTrbxsD5JvygJWJF88t2sj+fYMJMMySL7J7k/yzbt0NWMfnnl56belHyLiUUmDwJydfB1mBkTEk5L+G/iRpBtJerqujoi7Rzhsf+DysrLlwIcnKEyzuhAR1wHXlR5LmgecDLwK+BrwB5IrtJcApwAfH+F0D0dEb+bxQySfmXsAfyK50vMCSX1lx3UC89Pnn0bSIfU2kt7iKSSJ6U1lx9weEUPVvEZJC0muJn08fb0AC4GpwJWSIlO9FeiU1J15LbeVnXIh8ERE/L5UEBFbJP0y3deUnDw3l21lj2OYshZ2JMEHkXxjLa+DpLnA94GLSYZ0rCXp7foJyX8CWVsrxONhQ2bjFBHvk/RV4M3Am4DPSjqBpDdt2MMmJTiz+nYucGpEPCjpjcBnImJA0iXAZ0Y5tvwzrdSmWjL/XgycXuHYJ9J//wM4jOSL610kwyy+AuxSVr9/tBcCIGkPkqGYl0TEf2V2lWJ6B0kvdrl1ozxXpf8vNEx5U3DybMO5Pf13bkT87zB1XkHybfaDEbEJkgkSkxGcme0QEb8DfgecIelsklUErk53t5ZV/z2wGPhGpmzxhAdpVkckvQ9QRJybFrWQzAmCpPNnvJ07K4ADgD9GOtahgteSDKH6dhpTC7CAZEzxmEjqIGnzd5GMW85aBWwG9o2I74/x1KuA3SXtX+p9Tp/rlTzz/5Cm4uTZKoqIeyWdD5wr6WMkkx6mAy8nGef1ReAekm+eH5Z0KclkwE/VKmazZiPpBcD7SC5F/xl4Lsnk31+TXAnqA94saRWwJSKeJOnZ+h9Jt5FcOXoNcFSF05sVkqS9SCbKvSZT/HPgQ+mKU/9MMgRqPD5PMgTikvTKUC8wDzgc+GpE3AfcDRwm6UqStvohkjY85uQZ+GZ67HuA7sxwyw0R0Sfp88Dn0/Ifk+R/LwX+IiJGGp5yY/o6LpP0z8AGkiEtncBZOxFnIfiyuY1kGcmkin8j6a36KXAMyYQKIuIOkskW70/3fwT4YE0iNWtO/STjJ68guRx7JfAL4IR0jOQ/k8zU/zPwG4CIuJrkMvHHgDtIVgsY6cPTrGi+CZweEQ9kyk4kSW5XkLSr0YZtjCgi/kAy7LEL+BHJZ+S5JFdr16fV/gVYQzLG+ack46a/u5NPeTBJ/HeTTOYrbX+fxvPZ9PmOJZn/tDx9/MAoryNIEv67SIaC/YpkcuWbImLtTsba8DT81QQzMzMzM8tyz7OZmZmZWZWcPJuZmZmZVcnJs5mZmZlZlZw8m5mZmZlVycmzmZmZmVmVGmqd59133z3mzZtX6zDM6sbtt9++NiK6ax1HJW6vZs9Uz+0V3GbNyg3XZhsqeZ43bx4rVqyodRhmdUPSmlrHMBy3V7Nnquf2Cm6zZuWGa7O5DtuQNEvS1ZL6Ja2RdOQw9STpi5KeSLcvKXM7HDMzMzOzepR3z/OZwFZgDnAgcL2klRGxqqzeMpI71iwiub3zj0nuWnd2zvGYmZmZmeUmt55nSdOBI4BTIqIvIpYD1wJHVah+DPCViHgwIh4CvgK8O69YzMzMikTSCZJWSNoi6cJR6v6LpEclbZB0vqSOSQrTrCnkOWxjATAYEaszZSuBhRXqLkz3jVYPScvS/zBW9Pb25hasmZlZA3kY+Bxw/
kiVJL0FOBl4AzAP2Bf4zEQHZ9ZM8hy20QVsKCvbAMyoou4GoEuSIiKyFSPiHOAcgJ6enmfsK3fuz+9jy8AgJ7x+/lhjNyscSSeQXNF5KXB5RLx7mHrHACcC84GngMuAT0TEQLr/ZuAvgYH0kIci4oXjjW9gcIhv3PxHjl3yfKZNaai5y2aTLiKuApDUA+w9QtVjgPNKwyUlfRa4lCShzs3ZP/sjf3y8L89Tmk2qQ1+6J69/0ZydOjbPT6w+YGZZ2UxgYxV1ZwJ95YnzWP38nl76tgw4eTZLlHqq3gJMHaHeNOCDwC+BbpLhVh8BTs/UOSEi/jvP4K76zUOc8ePVPLVpG5986/55ntqsmS0Ersk8XgnMkTQ7Ip4oryxpGck8JObOnVvVE2wbHOL0H9zFjM42ZnT4i681ppfuvctOH5vnX/1qoE3S/Ii4Jy1bBJRPFiQtWwTcNkq9MRtf+m1WHNX2VEXEWZmHD0m6FHjdBIfHloEhADZtG5zopzJrJpWu7EJyFfhZyfNYru6WbE7b7ElvmM+xS/YdT6xmDSm3Mc8R0Q9cBZwmabqkxcBhwMUVqn8L+JCkvSQ9F/gwcOF4Y5CEc2ezcXstz/4y+wVJayXdKung4Q7cmTkKbrNmuap0ZRcqXwXeKZu3JV98O9p8k2JrTnn/5R9Pcnn4ceBy4LiIWCVpiaTs4KhvAtcBdwK/A65Py8ZF4K5ns3GQ9B6gB/hypvjjJJOO9iLpobpO0n6Vjo+IcyKiJyJ6urvr9kZqZkVWurJbsgh4rNKQjZ1V6nnuaG/N65RmDSXXwUoRsY5k/eby8ltILiWVHgfwsXTLjeReLLOdJelwknHOb4yItaXyiPhlptpFkt4F/DXw9VyeN4+TmBWcpDaSz+xWoFVSJzBQmtib8S3gwnT41SPAJ8nhym7JY09t5q5Hk07sTifP1qQKNdLfH8JmO0fSIcC5wN9ExJ2jVA9ybG7+wmtWlU8Cn848/gfgM5LOB34P7B8Rf4qIH0r6EnATyZXgK8uO22nrn97K4tNvZGAoabW7TG3P47RmDadQyTN41IZZSbU9VZJeT7KU1dsj4rayfbsCrwJ+RrJU3d+TjIn+4MS/AjMriYhTgVOH2d2VfRARZwBn5B3Duv6tDAwF7z5oHn/1wm4W7zc776cwawiFGu2fTBh09myW+iSwiWR9139If/6kpLmS+iSV1qU6BdgF+H5a3ifpB+m+dpLl7nqBtcAHgMMj4u68gvQVI7PGUJoo+Krnz+J1L9yDttZCpRBmVStUz7Nwz7NZSbU9VREx7LJ0EdELvCLXwMqfYyJPbma52TKQTBT0WGdrdoX62ig5eTYzM5sI25eoay9U6mA2ZgVrAV7n2azReNiGWWPY7J5nM6BgyXPS8+z02ayRuMWaNYaBwaS1trcUKnUwG7NCtQD3YJmZmU2MobRzyrmzNbtCNQGPeTZrPP7Sa9YYSld2W+RWa82tWMkzXqrOrNG4xZo1hvTeKE6erekVK3l2ezYzM5sQ24dt+LPWmlyhkmfwsA2zRuPPYbPGUOp5lnuqrMkVKnmWfAnYrNG4zZo1hnDPsxlQtOQZeak6MzOzCTDkCYNmQMGSZ9zzbNZw/DFs1hiGkhsMOnm2pleo5Fng7NmswbjJmjWGUs+zc2drdsVKnuXbc5uZmU2E0qjIFg96tiZXrOQZ357bzMxsInipOrNEsZJnj3k2axj+/DVrLL5JilmiWMkzXufZrFG4qZo1Fo95NksUK3mWb89tZmY2EcJL1ZkBRUueax2AmVXN7dWssXjYhlmiUMkzeNiGWaNwUzVrLJ4waJYoVvIsJ89mZmYTodTzLF83siZXqOTZDdpsB0knSFohaYukC0ep+y+SHpW0QdL5kjoy++ZJuknS05LukvTGXOLL4yRmNmlKY55VqMzBbOwK1QS2DAzy0PpNXuvZLPEw8Dng/JEqSXoLcDLwBmAesC/wmUyVy4HfALOBfwO+K6l7vMG5lZo1lvCYZ
zOgYMnz/97xCADL711b40jMai8iroqI7wFPjFL1GOC8iFgVEU8CnwXeDSBpAfAy4NMRsSkirgTuBI6YuMjNrBJJsyRdLalf0hpJRw5Tb1dJF0l6PN1OzeP5PebZLNFW6wAmwpNPb6t1CGaNZCFwTebxSmCOpNnpvvsiYmPZ/oXjfVJ//pqN2ZnAVmAOcCBwvaSVEbGqrN5/AtNIriTtAfxU0pqIuGA8T+7VNswShep5LhkYHKp1CGaNpAvYkHlc+nlGhX2l/TMqnUjSsnSc9Yre3t4Rn9TDNsyqJ2k6yRWfUyKiLyKWA9cCR1Wo/jbgSxHxdEQ8AJwHvHe8MfgmKWaJgibP/lg2G4M+YGbmcennjRX2lfZvpIKIOCcieiKip7t73MOizWyHBcBgRKzOlI10FUhlP79kvAH4JilmiUImz1vd82w2FquARZnHi4DHIuKJdN++kmaU7S+/TDxm/vg1G5OxXAX6IXCypBmSXkDS6zyt0knHcrXIwzbMEoVMnj1swwwktUnqBFqBVkmdkirNc/gW8I+S9pe0G/BJ4EKAtJfrt8Cn0+PfDhwAXDne+Hx9yGxMxnIV6ERgE3APyXyGy4EHK510LFeLPGHQLFHM5HnIH8tmJEnwJpJl6P4h/fmTkuZK6pM0FyAifgh8CbgJWJNun86c551AD/AkcDrwdxExcheVmeVtNdAmaX6mrOJVoIhYFxFLI2LPiFhI8ll/23gD2H6TFPc8W5PLJXmudvmctO7r0hsubJD0QB7PX87JsxlExKkRobLt1Ij4U0R0RcSfMnXPiIg5ETEzIt4TEVsy+x6IiIMjYmpEvDAifpJHfP74NateRPQDVwGnSZouaTFwGHBxeV1J+0maLalV0qHAMpI138cbg3udzciv5zm7fM5S4CxJw01i6Ce5acNHc3ruZ/GwDTMzK6DjganA4yRDMY6LiFWSlkjqy9R7Ocl67BuBLwBLKyxnN2ZDER7vbEYO6zxnls95SUT0AcsllZbPObm8fkTcBtyW1y1+K9nm1TbMzKxgImIdcHiF8ltIJhSWHn8H+E7ezz8UnixoBvn0PI91+ZwJt809z2ZmZrkaivAaz2bkkzyP6SYKYzWWZXRKPObZzMwsX+GeZzOgiuRZ0s2SYphtOWO8icJY7cxNF9zzbGZmlq+hIU8YNIMqxjxHxMEj7U/HPLdJmh8R96TFudxEYWf5DoNmZmb58phns8S4h22MZfkcAEkt6Y0b2pOH6pQ0ZbxxZA0MuefZzMwsTx7zbJbIa6m6isvnAFRYQue1JDdr+D4wN/35hpziALzahpmZWZ42bxvkwl88wFObB2odilnNjXupOhh++Zx0X/kSOjczwfdH8JhnMzOz/GzZ5s9Vs5Ji3p7bPc9mZmZmNgGKmTx7zLOZmVl+PNbZbLtiJs/ueTYzMzOzCVDI5HmrxzybmZnlxqtsmO1QyOR5y4CTZzMzMzPLn5NnMzMzG5E7ns12KFTyPKMjWXlvq5NnMzMzM5sAhUqerz9xCS2CrQODtQ7FzMysMORBz2bbFSp5njt7GocduJcnDJqZmZnZhChU8gwwpbXFwzbMzMxy5H5nsx2Klzy3OXk2MzMzs4nh5NmswCTNknS1pH5JayQdOUy9H0jqy2xbJd2Z2f+ApE2Z/TfkFWP4nkZmdc9Dns12aKt1AHmb0tbiMc9mO5wJbAXmAAcC10taGRGrspUi4tDsY0k3AzeWnettEfGTCYzVzMys7hWv57m1hW2DwdCQu7OsuUmaDhwBnBIRfRGxHLgWOGqU4+YBS4CLJzrG5Pkm41nMbDzkUc9m2xUveW5LXpJ7n81YAAxGxOpM2Upg4SjHHQ3cEhH3l5VfKqlX0g2SFuUVpIdtmJlZIylc8tzh5NmspAvYUFa2AZgxynFHAxeWlS0F5gH7ADcBP5K0a/mBkpZJWiFpRW9v787EbGZ1yFeIzHYoXPK8vefZkwbN+oCZZWUzgY3DHSDpNcCewHez5RFxa0Rsi
oinI+ILwHqSoR2U1TsnInoioqe7u7uqIP2hbGZmjaR4yXOrk2ez1GqgTdL8TNkiYNUw9QGOAa6KiL5Rzh3ktPSrh22YmVkjKV7y7J5nMwAioh+4CjhN0nRJi4HDGGYioKSpwDsoG7Ihaa6kxZKmSOqU9FFgd+DW8cTnHmez6o1h2ckOSWdLekzSOknXSdprsuM1K7LiJs8e82wGcDwwFXgcuBw4LiJWSVoiqbx3+XCSMdE3lZXPAM4CngQeAg4BDo2IJ8YTmHuczcYku+zkUuAsSZUm/54EvBo4AHguyRCrr4/3yf1l12yHwq3z3NHWCrjn2QwgItaRJMXl5beQTCjMll1OkmCX111F8kFsZjWQWXbyJemQquWSSstOnlxW/fnAjyLisfTYK4AzJjNes6IrbM/zFifPZnXNPVlmVRvLspPnAYslPVfSNJJe6h+MNwCv82y2Q+F6nj1h0KwxeNiGWdXGsuzkauBPJEOsBoE7gROGO7GkZcAygLlz5+YRq1nhFbbn2WOezcysIMay7ORZQCcwG5hOMml42J7napeX9JUisx0Klzx3eLUNs4bgD2Ozqo1l2clFwIURsS4itpBMFnylpN0nIU6zplC45NlL1Zk1Bg/bMKvOGJed/BVwtKRdJLWTrLjzcESsHU8M/q5rtkPxkufSmOfBwRpHYmZmlptql538CLAZuAfoBf4aePtkB2tWZMWbMOieZ7OG4GEbZtWrdtnJdP31pZMYmlnTKV7Ps5NnMzOzXMnfds22K2zy7HWezczMzCxvxUueW71UnZmZWZ7c72y2Q3GTZ/c8m5mZmVnOCpc8t7SI9lY5eTYzM8uJhzyb7ZBL8ixplqSrJfVLWiPpyBHqflTS7yRtlHS/pI/mEUPWlNYWJ89mZmZmlru8lqo7E9gKzAEOBK6XtDIiKt39SMDRwB3AfsANkv4cEVfkFAtT2lo85tnMzCwnXm3DbIdx9zxLmg4cAZwSEX0RsRy4FjiqUv2I+FJE/DoiBiLibuAaYPF448ia0uaeZzMzMzPLXx7DNhYAgxGxOlO2Elg42oFKvsouASr1UO80J89mZmZmNhHySJ67gA1lZRuAGVUce2oawwXDVZC0TNIKSSt6e3urCmhKawtbPGzDzMzMzHI2avIs6WZJMcy2HOgDZpYdNhPYOMp5TyAZ+/w3EbFluHoRcU5E9ERET3d39+ivCOhoa2XLNifPZmZmZpavUScMRsTBI+1Pxzy3SZofEfekxYsYYSiGpPcCJwOvjYgHqw+3OlOntLJ522DepzUzMzOzJjfuYRsR0Q9cBZwmabqkxcBhwMWV6ktaCnweeFNE3Dfe569kansrm5w8m5mZmVnO8rpJyvHAVOBx4HLguNIydZKWSOrL1P0cMBv4laS+dDs7pziApOf56a1Ons3MzMwsX7kkzxGxLiIOj4jpETE3Ii7L7LslIroyj58fEe0R0ZXZ/imPOEqmtnvYhhlUfwMjSadK2pb5Qtsnad/M/gMl3S7p6fTfAyfvVZiZmdWPwt2eG2DalFae3jpQ6zDM6kH2BkZLgbMkDbeM5LfLvtTeByBpCsl67JcAuwEXAdek5WZmZk2lkMlzZ3srmzxsw5rcWG9gNIKDSSYX/1dEbImIr5HcKfT1ecZrZmbWCAqZPE+b4gmDZoz9BkZvk7RO0ipJx2XKFwJ3RERkyu6odJ6dWZcdYvQqZmZmdaKQyfPU9la2DQbbfKMUa25juYHRd4AXA93A+4BPSXrXWM+zM+uym5mZNZJiJs9TWgHc+2zNruobGEXE7yPi4YgYjIhfAF8F/m6s59k5yuc0ZmZmk6DQyfNmj3u25raa9AZGmbIRb2CUEezIalcBB0jKZrkHVHmeKp/KzMysMRQyeZ6WJs9e69ma2VhuYCTpMEm7KfFK4ESSFTYAbgYGgRMldUg6IS2/ccJfhJmZWZ0pZPI8td3DNsxSFW9gVOHmRe8E7iUZivEt4IsRcRFARGwFDgeOBtYD7wUOT
8tz4GEbZmbWONpqHcBEmDoleVnuebZmFxHrSBLf8vJbSCYClh6/q7xOWf3fAC/PPcDk7BNzWjMzswlQ6J5n32XQzMzMzPJUyOTZY57NGomHbZhVQ9IsSVdL6pe0RtKRw9T7gaS+zLZV0p2THa9ZURVy2EanxzybNRAP2zCr0pnAVmAOcCBwvaSVEfGMlW8i4tDsY0k34wm+ZrkpdM/zpq0DNY7EzMxs/CRNB44ATomIvohYDlwLHDXKcfOAJVRYZcfMdk4hk+ftq2142IZZA/CwDbMqLAAGI2J1pmwlsHCU444GbomI+ycsMrMmU8jkeVpHkjz3O3k2awAetmFWhS5gQ1nZBmDGKMcdDVw43E5JyyStkLSit7d3fBGaNYlCJs8dba1MaWvhqc3bah2KmZlZHvqAmWVlM0nWZq9I0muAPYHvDlcnIs6JiJ6I6Onu7s4lULOiK2TyDDCzs42+zR7zbFb/PGzDrAqrgTZJ8zNli4BVw9QHOAa4KiL6RqhjZmNU2OS5q6ONjU6ezRqAh22YjSYi+oGrgNMkTZe0GDiMYSYCSpoKvIMRhmyY2c4pbPI8o7OdjR62YWZmxXE8MBV4HLgcOC4iVklaIqm8d/lwkjHRN01yjGaFV8h1ngFmdLrn2awxeNiGWTUiYh1JUlxefgvJhMJs2eUkCbaZ5azAPc9Ons0ag4dtmJlZ4yhs8tzV0U7fFifPZmZmZpafwibPMzrbvFSdmZmZmeWqsMnzzM42+rYMMDTkS8Jm9Uge62xmZg2osMnzjM52IqB/q4dumNWj8FhnMzNrQIVNnrs6k4VEPGnQzMzMzPJS2OR5Rpo8e9KgWX3ysA0zM2tEhU2eZ3a2A7BhkycNmtUjD9swM7NGVNjkedb0KQCs699a40jMzMzMrCgKmzzP7kqS5yf6nDxb85I0S9LVkvolrZF05DD1Pirpd5I2Srpf0kfL9j8gaZOkvnS7YdyxediGmZk1oMLenntHz/OWGkdiVlNnAluBOcCBwPWSVkbEqrJ6Ao4G7gD2A26Q9OeIuCJT520R8ZO8AvOwDTMza0SF7XnuaGulq6ONJzxsw5qUpOnAEcApEdEXEcuBa4GjyutGxJci4tcRMRARdwPXAIsnN2IzM7P6V9jkGZLeZ495tia2ABiMiNWZspXAwpEOkiRgCVDeO32ppF5JN0haNMyxyyStkLSit7d3xOA8bMPMzBpRoZPn2V1Onq2pdQEbyso2ADNGOe5Ukv8bLsiULQXmAfsANwE/krRr+YERcU5E9ERET3d394hP4mEbZmbWiHJJnqudlJTW/aCk+yQ9JelhSf8paULGXs+ePoW1njBozasPmFlWNhPYONwBkk4gGfv8NxGxfcJARNwaEZsi4umI+AKwnqR32szMrKnk1fOcnZS0FDhL0nCXhq8DXhYRM4GXAIuAE3OK4xmSYRueMGhNazXQJml+pmwRzx6OAYCk9wInA2+IiAdHOXfA+MZdeNiGmZk1onEnz2OZlAQQEX+MiPWlw4Eh4AXjjaOS2V0drOvfytCQLw9b84mIfuAq4DRJ0yUtBg4DLi6vK2kp8HngTRFxX9m+uZIWS5oiqTNdxm534NZxxedhG2Zm1oDy6Hke86QkSUdKegpYS9IT9s0R6lY9Aancc3bpZNtgsLbPvc/WtI4HpgKPA5cDx0XEKklLJPVl6n0OmA38KrOW89npvhnAWcCTwEPAIcChEfHEpL0KMzOzOpHHWOMxT0qKiMuAy9LLyUcDj41Q9xzgHICenp4xdVXttetUAB5av4k9ZnaO5VCzQoiIdcDhFcpvIWm7pcfPH+Ecq4AD8o7NwzbMzKwRjdrzLOlmSTHMtpydmJRUEhH3kIy//MbOBD+avXbbkTybWX3xsA0zM2tEo/Y8R8TBI+1Pxzy3SZqfJsMwwqSkYWLYr8q6Y/LcUs/zk06ezczMzGz8xj3meSyTkgAkHStpj/Tn/WrTRS8AAAsMSURBVIF/BX463jgqmdnZz
ozONh52z7NZ3fGwDTMza0R5LVVXcVISQIWJSYuBOyX1A99Pt0/kFMez7LXrVA/bMDMzM7Nc5HJzkuEmJaX7yicmvSeP56zWvNnTWf3YqMOvzczMzMxGVejbcwMs2HMGDzzRz+Ztg7UOxczMzMwaXOGT5xftOYOhgHsf7xu9spmZWZ2SNEvS1ZL6Ja2RdOQIdV8m6efpmu2PSTppMmM1K7LCJ88L5iTLTd/9qIdumJlZQzsT2ArMAZYCZ0l61g3JJO0O/JDkBmSzSe7ie8MkxmlWaIVPnufNnkZHWwu/e7j8Pi5mZmaNIV0W9gjglIjoi4jlwLXAURWqfwj4UURcGhFbImJjRPxhMuM1K7LCJ89trS28bO5u3Hb/ulqHYmYZSleqGxzyzVLMqrAAGIyI1ZmylcCzep6BvwTWSfqFpMclXSdpbqWTSlomaYWkFb29vRMQtlnxFD55BnjVvrP4/SNPseHpbbUOxcxSLWnyvGGT26VZFbqA8kuoG4AZFeruDRwDnATMBe4nWUb2WSLinIjoiYie7u7uHMM1K66mSJ5fve9sImD5vWtrHYqZlXnSX2rNqtEHzCwrmwlUmtCzCbg6In4VEZuBzwAHSdplgmM0awpNkTz3zJtF94wOrvntQ7UOxczKPNm/tdYhmDWC1UCbpPmZskXAqgp17wCy46FKP/u2nmY5aIrkubVFHLboudx09+M8umFzrcMxMyDSj/MHnuhn68BQbYMxq3MR0Q9cBZwmabqkxcBhwMUVql8AvF3SgZLagVOA5RGxfvIiNiuupkieAY45aB4R8LUb76l1KGaWsW0w2P9TP+R/Vvy51qGY1bvjganA4yRjmI+LiFWSlkjafjODiLgR+ARwfVr3BcCwa0Kb2djkcnvuRvC8WdM46tX7cMGtD3Dwgm7evHDPWodkZsAr581iy+AQH7vyDmZNn8IbXjyn1iGZ1aWIWAccXqH8FpIJhdmys4CzJik0s6bSNMkzwMcPeRG3r3mS4y/9NR9843yOPmgeMzvbax2WWVMqDcL86rsOZNepU3jHN3/BsotvZ9HeuzC9o40I2DY4xMBQMDA4xLbBYGBoiIHBYNvQ0PZhH2b14vL3/SXPmzWt1mGY2QRrquS5s72VS459FSdfeQdfvmE1X7vxXg7YaxdesEcXs6ZPYddp7bS3ttDWIlpbWmhtgRYJKZ85FnnO1MgpJKtju0xtb4peWCGmTmnl4ve+inNvuY9f/+lJnto8QIugvaWFjrYWujraaG8VbS0ttLWK9tYWtwGrOx3tTTMS0qypNVXyDDCzs51vLH05K/+8nmtXPswdD67nJ394jCef3uabNVhdefFzZjZF8lyy2/QpfOyQF9U6DDMbQc8+u9U6BLOaa7rkuWTR83Zl0fN23f44IujfOshAepl4cCgYGAqGckqo87zEHDjJbwbtrcXuxXrrAc/hoP1ms3vXlFqHYmZVuOffD6XFl3zMmjd5LieJrg6/HVYskmYB5wFvBtYC/xoRl1WoJ+B04Ni06Dzg4xHJ1z5JB6ZlLwb+APxjRPx2PLHN6GxnhuccmDWMon+hN6uWW4JZsZ0JbAXmAEuBsyQtrFBvGcks/kXAAcBbgfcDSJoCXANcAuwGXARck5abmZk1FSfPZgUlaTpwBHBKRPRFxHLgWuCoCtWPAb4SEQ9GxEPAV4B3p/sOJrlK9V8RsSUivkYy//X1E/wSzMzM6o6TZ7PiWgAMRsTqTNlKoFLP88J0X6V6C4E7SkM4UncMcx4zM7NCc/JsVlxdwIaysg3AjCrqbgC60rHQVZ9H0jJJKySt6O3t3enAzczM6pWTZ7Pi6gNmlpXNBDZWUXcm0Jf2Nld9nog4JyJ6IqKnu7t7pwM3MzOrVw21vMTtt9++VtKaUartTrKqQKNy/LXTiLHvM8K+1UCbpPkRcU9atghYVaHuqnTfbRXqrQI+LEmZoRsHkExGHJbba0Nw/JNrpPZac26zda+RY4fGjL9im1UU7B63k
lZERE+t49hZjr92Gjn24Ui6guRO2McCBwLfBw6KiFVl9f4JOAl4Y1r/x8DXI+LsdFWNe4AzgLOB9wEfBeZHxNZxxtfQ77njr61Gj78RNfp73sjxN3Ls0PjxZ3nYhlmxHQ9MBR4HLgeOi4hVkpZI6svU+yZwHXAn8Dvg+rSMNEE+HDgaWA+8Fzh8vImzmZlZI2qoYRtmNjYRsY4k8S0vv4VkImDpcQAfS7dK5/kN8PIJCtPMzKxhFLHn+ZxaBzBOjr92Gjn2RtXo77njr61Gj78RNfp73sjxN3Ls0Pjxb1e4Mc9mZmZmZhOliD3PZmZmZmYTwsmzmZmZmVmVCpM8S5ol6WpJ/ZLWSDqy1jGVSOqQdF4a10ZJv5F0aLpvnqSQ1JfZTik79nxJT0l6VNKHavQabpa0ORPj3Zl9R6avrV/S9yTNyuyr+e+l7L3tkzQo6evpvoZ4/4umHv4uRuI26zZrz1QPfxfDcXt1e510EVGIjWQZrm+TrCDwGpLbBy+sdVxpbNOBU4F5JF9Y3kpyd7Z56RZA2zDHfgG4BdgNeDHwKHBIDV7DzcCxFcoXpq/ltel7fxlwRb3+XtLfRR/w2vRxQ7z/Rdvq7e9imL8Tt9k6+N24zdbHVm9/FxX+Rtxe6+D30iztteYB5PjL2gosyJRdDJxe69hGiPkO4Igq/rAeAt6cefzZbMOZxHiHa9ifBy7LPN4v/V3MqMffC3AMcB87Jss2xPtfpK0e/y6qjNtttjbvu9tsjbd6/LuoIma319q8703RXosybGMBMBgRqzNlK0m+sdUdSXNIYs7e5W2NpAclXSBp97TebsBzSV5LSS1f1xckrZV0q6SD07KFZOKLiD+SNmbq8/dyDPCtSFtpRiO8/0VRj38XI3KbdZttcvX4dzEst1e314lWlOS5i+RSRdYGkm9mdUVSO3ApcFFE3EVyn/dXkNw//eUkMV+aVi/dxCL72mr1uj4O7AvsRbJW43WS9mPk976ufi+S5gJ/BVyUKW6U979I6urvYjRus26zVl9/FyNxe3V7nQxFucNgHzCzrGwmyTihuiGpheSSylbgBICI6ANWpFUek3QC8IikmSSvC5LXsjnz86S/roj4ZebhRZLeBfw1I7/3QyPsq4WjgeURcX+poFHe/4JpiPYKbrNl+2rBbbY+NESbdXt9xr5aaJr2WpSe59VAm6T5mbJFPPOSTU1JEnAeMAc4IiK2DVO1dKlDEfEk8AjJaympl9cVgEhi2R6fpH2BDpLfSb39Xo7mmd+IK2mU97+R1dvfRUVus9u5zVq9/V08i9vrdm6vk6HWg67z2oArSGadTgcWU0czgdP4zgb+H9BVVv4q4IUkX2Rmk8yavSmz/3TgZyQzUV9E8oc2qTNRgV2BtwCdJFcrlgL9adwLgaeAJel7fwnPnAlcF78X4KA05hmN9v4XcauXv4tRYnSbdZv1Vmd/FyPE5/bq9jp5r7fWAeT4i5sFfC/95f0JOLLWMWVi24fk29ZmkssUpW0p8C7g/jTuR4BvAXtmju0Azk8bz2PAh2oQfzfwK5JLKevT/6DelNl/ZPqe9wPXALPq7fcCfBO4uEJ53b//Rdzq5e9ihPjcZt1mvT3zfa+Lv4thYnN7dXud1K20lIiZmZmZmY2iKGOezczMzMwmnJNnMzMzM7MqOXk2MzMzM6uSk2czMzMzsyo5eTYzMzMzq5KTZzMzMzOzKjl5NjMzMzOrkpNnMzMzM7MqOXk2MzMzM6vS/wfiI2mA+SNXPQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.plot_layer_stats(-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, the problems get worse towards the end of the network, as the instability and zero activations compound over layers." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Increase batch size" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One way to make training more stable is to *increase the batch size*. Larger batches have gradients that are more accurate, since they're calculated from more data. On the downside though, a larger batch size means fewer batches per epoch, which means less opportunities for your model to update weights. Let's see if a batch size of 512 helps:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dls = get_dls(512)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
02.3093852.3027440.11350000:08
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see what the penultimate layer looks like:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.plot_layer_stats(-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, we've got most of our activations near zero. Let's see what else we can do to improve training stability." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1cycle training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our initial weights are not well suited to the task we're trying to solve. Therefore, it is dangerous to begin training with a high learning rate: we may very well make the training diverge instantly, as we've seen above. We probably don't want to end training with a high learning rate either, so that we don't skip over a minimum. But we want to train at a high learning rate for the rest of training, because we'll be able to train more quickly. Therefore, we should change the learning rate during training, from low, to high, and then back to low again.\n", - "\n", - "Leslie Smith (yes, the same guy that invented the learning rate finder!) developed this idea in his article [Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates](https://arxiv.org/abs/1708.07120) by designing a schedule for the learning rate separated into two phases: one where the learning rate grows from the minimum value to the maximum value (*warm-up*), and then one where it decreases back to the minimum value (*annealing*). Smith called this combination of approaches *1cycle training*.\n", - "\n", - "1cycle training allows us to use a much higher maximum learning rate than other types of training, which gives two benefits:\n", - "\n", - "- By training with higher learning rates, we train faster, a phenomenon Leslie N. 
Smith named *super-convergence*\n", - "- By training with higher learning rates, we overfit less because we skip over the sharp local minima to end up in a smoother (and therefore more generalizable) part of the loss.\n", - "\n", - "The second point is an interesting and subtle idea; it is based on the observation that a model that generalises well is one whose loss would not change very much if you change the input by a small amount. If a model trains at a large learning rate for quite a while, and can find a good loss when doing so, it must have found an area that also generalises well, because it is jumping around a lot from batch to batch (that is basically the definition of a high learning rate). The problem is that, as we have discussed, just jumping to a high learning rate is more likely to result in diverging losses, rather than seeing your losses improve. So we don't just jump to a high learning rate. Instead, we start at a low learning rate, where our losses do not diverge, and we allow the optimiser to gradually find smoother and smoother areas of our parameters, by gradually going to higher and higher learning rates.\n", - "\n", - "Then, once we have found a nice smooth area for our parameters, we then want to find the very best part of that area, which means we have to bring our learning rates down again. This is why 1cycle training has a gradual learning rate warmup, and a gradual learning rate cooldown. Many researchers have found that in practice this approach leads to more accurate models, and trains more quickly. That is why it is the approach that is used by default for `fine_tune` in fastai.\n", - "\n", - "Later in this book we'll learn all about *momentum* in SGD. Briefly, momentum is a technique where the optimizer takes a step not only in the direction of the gradients, but also continues in the direction of previous steps. 
Leslie Smith introduced cyclical momentums in [A disciplined approach to neural network hyper-parameters: Part 1](https://arxiv.org/pdf/1803.09820.pdf). It suggests that the momentum varies in the opposite direction of the learning rate: when we are at high learning rate, we use less momentum, and we use more again in the annealing phase.\n", - "\n", - "We can use 1cycle training in fastai by calling `fit_one_cycle`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def fit(epochs=1, lr=0.06):\n", - " learn = Learner(dls, simple_cnn(), loss_func=F.cross_entropy,\n", - " metrics=accuracy, cbs=ActivationStats(with_hist=True))\n", - " learn.fit_one_cycle(epochs, lr)\n", - " return learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
00.2108380.0848270.97430000:08
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We're finally making some progress! It's giving us a reasonable accuracy now.\n", - "\n", - "We can view the learning rate and momentum throughout training by calling `plot_sched` on `learn.recorder`. `learn.recorder` (as the name suggests) records everything that happens during training, including losses, metrics, and hyperparameters such as learning rate and momentum:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.recorder.plot_sched()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Smith's original 1cycle paper used a linear warm-up and linear annealing. As you see above, we adapted the approach in fastai by combining it with another popular approach: cosine annealing. `fit_one_cycle` provides the following parameters you can adjust:\n", - "\n", - "- `lr_max`: The highest learning rate that will be used (this can also be a list of learning rates for each layer group, or a python `slice` object containing the first and last layer group learning rates)\n", - "- `div`: How much to divide `lr_max` by to get the starting learning rate\n", - "- `div_final`: How much to divide `lr_max` by to get the ending learning rate\n", - "- `pct_start`: What % of the batches to use for the warmup\n", - "- `moms`: A tuple `(mom1,mom2,mom3)` where mom1 is the initial momentum, mom2 is the minimum momentum, and mom3 is the final momentum.\n", - "\n", - "Let's take a look at our layer stats again:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.plot_layer_stats(-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The % of near-zero activations is getting much better, although it's still quite high.\n", - "\n", - "We can see even more about what's going on in our training using `color_dim`, passing it a layer index:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.color_dim(-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`color_dim` was developed by fast.ai in conjunction with a student, Stefano Giomo. Stefano, who refers to the idea as the *colorful dimension*, has a [detailed explanation](https://forums.fast.ai/t/the-colorful-dimension/42908) of the history and details behind the method. The basic idea is to create a histogram of the activations of a layer, which we would hope would follow a smooth pattern such as the normal distribution shown by Stefano here:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Histogram" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To create `color_dim`, we take the histogram shown on the left here, and convert it into just the colored representation shown at the bottom. Then we flip it on its side, as shown on the right. We found that the distribution is clearer if we take the `log` of the histogram values. Then, Stefano describes:\n", - "\n", - "> : The final plot for each layer is made by stacking the histogram of the activations from each batch along the horizontal axis. So each vertical slice in the visualisation represents the histogram of activations for a single batch. 
The color intensity corresponds to the height of the histogram, in other words the number of activations in each histogram bin.\n", - "\n", - "This is Stefano's picture of how this all fits together:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Summary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So with that in mind, let's take another look at the result for the penultimate layer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.color_dim(-2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This shows a classic picture of \"bad training\". We start with nearly all activations at zero--that's what we see at the far left, with nearly all the left hand side dark blue; the bright yellow at the bottom are the near-zero activations. Then over the first few batches we see the number of non-zero activations exponentially increasing. But it goes too far, and collapses! We see the dark blue return, and the bottom becomes bright yellow again. It almost looks like training restarts from scratch. Then we see the activations increase again, and then it collapses again. After repeating a few times, eventually we see a spread of activations throughout the range.\n", - "\n", - "It's much better if training can be smooth from the start. The cycles of exponential increase and then collapse that we see above tend to result in a lot of near-zero activations, resulting in slow training, and poor final results." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Batch normalization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To fix this, we need to both fix the initial large percentage of near-zero activations, and then try to maintain a good distribution of activations throughout training.\n", - "\n", - "Sergey Ioffe and Christian Szegedy showed a solution to this problem in the 2015 paper [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167). In the abstract, they describe just the problem that we've seen:\n", - "\n", - "> : \"Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. 
This slows down the training by requiring lower learning rates and careful parameter initialization... We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs.\"\n", - "\n", - "Their solution, they say, is:\n", - "\n", - "> : \"...making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization.\"\n", - "\n", - "The paper caused great excitement as soon as it was released, because they showed this chart, which clearly demonstrated that batch normalization could train a model that was even more accurate than the current state of the art (the *inception* architecture), around 5x faster:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Impact" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The way batch normalization (often just called *batchnorm*) works is that it takes an average of the mean and standard deviations of the activations of a layer, and uses those to normalize the activations. However, this can cause problems because the network might really want some activations to be really high in order to make accurate predictions. So they also add two learnable parameters (meaning they will be updated in our SGD step), usually called `gamma` and `beta`; after normalizing the activations to get some new activation vector `y`, a batchnorm layer returns `gamma*y + beta`.\n", - "\n", - "That way our activations can have any mean or variance, which is independent from the mean and std of the results of the previous layer. Those statistics are learned separately, making training easier on our model. The behavior is different during training and validation: during training, we use the mean and standard deviation of the batch to normalize the data. 
During validation, we instead use a running mean of the statistics calculated during training.\n", - "\n", - "Let's add a batchnorm layer to `conv`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def conv(ni, nf, ks=3, act=True):\n", - " layers = [nn.Conv2d(ni, nf, stride=2, kernel_size=ks, padding=ks//2)]\n", - " layers.append(nn.BatchNorm2d(nf))\n", - " if act: layers.append(nn.ReLU())\n", - " return nn.Sequential(*layers)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "...and fit our model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
00.1300360.0550210.98640000:10
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's a great result! Let's take a look at `color_dim`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "learn.activation_stats.color_dim(-4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is just what we hope to see: a smooth development of activations, with no \"crashes\". Batchnorm has really delivered on its promise here! In fact, batchnorm has been so successful that we see it (or something very similar) today in nearly all modern neural networks.\n", - "\n", - "An interesting observation about models containing batch normalisation layers is that they tend to generalise better than models that don't contain them. Although we haven't as yet seen a rigourous analysis of what's going on here, most researchers believe that the reason for this is that batch normalisation add some extra randomness to the training process. Each mini batch will have a somewhat different mean and standard deviation to each other mini batch. Therefore, the activations will be normalised by different values each time. In order for the model to make accurate predictions, it will have to learn to become insensitive to these variations. In general, adding additional randomisation to the training process often helps.\n", - "\n", - "Since things are going so well, let's train for a few more epochs and see how it goes. 
In fact, let's even *increase* the learning rate, since the abstract of the batchnorm paper claimed we should be able to \"train at much higher learning rates\":" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
00.1917310.1217380.96090000:11
10.0837390.0558080.98180000:10
20.0531610.0444850.98710000:10
30.0344330.0302330.99020000:10
40.0176460.0254070.99120000:10
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit(5, lr=0.1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
epochtrain_lossvalid_lossaccuracytime
00.1832440.0840250.97580000:13
10.0807740.0670600.97880000:12
20.0502150.0625950.98130000:12
30.0300200.0303150.99070000:12
40.0151310.0251480.99210000:12
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "learn = fit(5, lr=0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "At this point, I think it's fair to say we know how to recognize digits! It's time to move on to something harder..." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Questionnaire" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. What method can we use to see that data in DataLoaders?\n", - "1. Why do we double the number of filters after each stride 2 conv?\n", - "1. Why do we use a larger kernel in the first conv with MNIST (with `simple_cnn`)?\n", - "1. What information does `ActivationStats` save for each layer?\n", - "1. How can we access a learner's callback after training?\n", - "1. What are the three statistics plotted by `plot_layer_stats`? What does the x-axis represent?\n", - "1. Why are activations near zero problematic?\n", - "1. What are the upsides and downsides of training with a larger batch size?\n", - "1. Why should we avoid using a high learning rate at the start of training?\n", - "1. What is 1cycle training?\n", - "1. What are the benefits of training with a high learning rate?\n", - "1. Why do we want to use a low learning rate at the end of training?\n", - "1. What is cyclical momentum?\n", - "1. What callback tracks hyperparameter values during training (along with other information)?\n", - "1. What does one column of pixels in the `color_dim` plot represent?\n", - "1. What does \"bad training\" look like in `color_dim`? Why?\n", - "1. What trainable parameters does a batch normalization layer contain?\n", - "1. What statistics are used to normalize in batch normalization during training? How about during validation?\n", - "1. Why do models with batch normalization layers generalize better?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Further research" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. There are other normalization layers available in PyTorch. Try them out and see what works best. Learn about why other normalization layers have been developed, and how they differ from batch normalization.\n", - "1. Try moving the activation function after the batch normalization layer in `conv`. Does it make a difference? See what you can find out about what order is recommended, and why.\n", - "1. Batch normalization isn't defined for a batch size of one, since the standard deviation isn't defined for a single item. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "split_at_heading": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/15_resnet.ipynb b/14_resnet.ipynb similarity index 98% rename from 15_resnet.ipynb rename to 14_resnet.ipynb index 1614936..b5217a0 100644 --- a/15_resnet.ipynb +++ b/14_resnet.ipynb @@ -26,6 +26,15 @@ "# Resnets" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this chapter, we will build on top of the CNNs introduced in the previous chapter and explain to you the ResNet (for residual network) architecture. It was introduced in 2015 in [this article](https://arxiv.org/abs/1512.03385) and is by far the most used model architecture nowadays. More recent developments in models almost always use the same trick of residual connections, and most of the time, they are just a tweak of the original ResNet.\n", + "\n", + "We will first show you the basic ResNet as it was first designed, then explain to you what modern tweaks to it make it more performant. 
But first, we will need a problem a little bit more difficult than the MNIST dataset, since we are already close to 100% accuracy with a regular CNN on it." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -37,7 +46,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It's going to be tough to judge any improvement we do to our models when we are already at an accuracy that is as high as we saw on MNIST in the previous chapter, so we will tackle a tougher problem by going back to Imagenette. We'll stick with small images to keep things reasonably fast.\n", + "It's going to be tough to judge any improvement we do to our models when we are already at an accuracy that is as high as we saw on MNIST in the previous chapter, so we will tackle a tougher image classification problem by going back to Imagenette. We'll stick with small images to keep things reasonably fast.\n", "\n", "Let's grab the data--we'll use the already-resized 160px version to make things faster still, and will random crop to 128px:" ] @@ -303,7 +312,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "That's a pretty good start, considering we have to pick the correct one of ten categories, and we're training from scratch for just 5 epochs!" + "That's a pretty good start, considering we have to pick the correct one of ten categories, and we're training from scratch for just 5 epochs! But we can do way better than this using a deeper model. However, just stacking new layers won't really improve our results (you can try and see for yourself!). To work around this problem, ResNets introduce the idea of skip connections. Let's have a look at what it is exactly." ] }, { @@ -317,7 +326,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now have all the pieces needed to build the models we have been using in each computer vision task since the beginning of this book: ResNets. 
We introduce the main idea behind them and show how it improves accuracy Imagenette compared to our previous model, before building a version with all the recent tweaks." + "We now have all the pieces needed to build the models we have been using in each computer vision task since the beginning of this book: ResNets. We'll introduce the main idea behind them and show how it improves accuracy on Imagenette compared to our previous model, before building a version with all the recent tweaks." ] }, { @@ -335,7 +344,7 @@ "\n", "> : Unexpectedly, such degradation is not caused by overfitting, and adding more layers to a suitably deep model leads to higher training error, as [previously reported] and thoroughly verified by our experiments.\n", "\n", - "This is the graph they showed, with training error on the left, and test on the right:" + "They showed the graph in <>, with training error on the left, and test on the right." ] }, { @@ -361,7 +370,7 @@ "\n", "What has that gained us, then? The key thing is that those 36 extra layers, as they stand, are an *identity mapping*, but they have *parameters*, which means they are *trainable*. So, we can start with our best 20 layer model, add these 36 extra layers which initially do nothing at all, and then *fine tune the whole 56 layer model*. If those extra 36 layers can be useful, then they can learn parameters to do so!\n", "\n", - "The ResNet paper actually proposed a variant of this, which is to instead \"skip over\" every 2nd convolution, so effectively we get `x+conv2(conv1(x))`. Or In diagram form (from the paper):" + "The ResNet paper actually proposed a variant of this, which is to instead \"skip over\" every 2nd convolution, so effectively we get `x+conv2(conv1(x))`. This is shown by the diagram in <> (from the paper)." ] }, { @@ -659,7 +668,7 @@ "\n", "The authors of the ResNet paper went on to win the 2015 ImageNet challenge. At the time, this was by far the most important annual event in computer vision. 
We have already seen another ImageNet winner: the 2013 winners, Zeiler and Fergus. It is interesting to note that in both cases the starting point for the breakthroughs were experimental observations. Observations about what layers actually learn, in the case of Zeiler and Fergus, and observations about which kind of networks can be trained, in the case of the ResNet authors. This ability to design and analyse thoughtful experiments, or even just to see an unexpected result say \"hmmm, that's interesting\" — and then, most importantly, to figure out what on earth is going on, with great tenacity, is at the heart of many scientific discoveries. Deep learning is not like pure mathematics. It is a heavily experimental field, so it's important to be strong practitioner, not just a theoretician.\n", "\n", - "Since the ResNet was introduced, there's been many papers studying it and applying it to many domains. One of the most interesting, published in 2018, is [Visualizing the Loss Landscape of Neural Nets](https://arxiv.org/abs/1712.09913). It shows that using skip connections help smoothen the loss function, which makes training easier as it avoids us falling into a very sharp area. Here's a stunning picture from the paper, showing the bumpy terrain that SGD has to navigate to optimize a regular CNN (left) versus the smooth surface of a ResNet (right):" + "Since the ResNet was introduced, there's been many papers studying it and applying it to many domains. One of the most interesting, published in 2018, is [Visualizing the Loss Landscape of Neural Nets](https://arxiv.org/abs/1712.09913). It shows that using skip connections help smoothen the loss function, which makes training easier as it avoids us falling into a very sharp area. <> shows a stunning picture from the paper, showing the bumpy terrain that SGD has to navigate to optimize a regular CNN (left) versus the smooth surface of a ResNet (right)." 
] }, { @@ -669,6 +678,13 @@ "\"Impact" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This first model is already good, but further research has discovered more tricks we can apply to make it better." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -910,7 +926,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Even although we have more channels (and our model is therefore even more accurate), our training is just as fast as before, thanks to our optimized stem." + "Even though we have more channels (and our model is therefore even more accurate), our training is just as fast as before, thanks to our optimized stem.\n", + "\n", + "To make our model deeper without taking too much compute or memory, the ResNet paper introduced another kind of block for ResNets with a depth of 50 or more, using something called a bottleneck. " ] }, { @@ -924,7 +942,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Things are a tiny bit more complicated for deeper models like `resnet50` as they don't use the same resnet blocks: instead of stacking two convolutions with a kernel size of 3, they use three different convolutions: two 1x1 (at the beginning and the end) and one 3x3, as shown in the right of this image from the ResNet paper (using an example of 64 channel output, comparing to the regular ResBlock on the left):" + "Instead of stacking two convolutions with a kernel size of 3, *bottleneck layers* use three different convolutions: two 1x1 (at the beginning and the end) and one 3x3, as shown in the right of <> the ResNet paper (using an example of 64 channel output, comparing to the regular ResBlock on the left)." ] }, { @@ -938,7 +956,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Why? 1x1 convolutions are much faster, so even if this seems to be a more complex design, this block executes faster than the first resnet block we saw. 
This then lets us use more filters: as we see on the illustration, the number of filters in and out is 4 times higher (256) and the 1 by 1 convs are here to diminish then restore the number of channels (hence the name bottleneck). The overall impact is that we can use more filters in the same amount of time.\n", + "Why is that useful? 1x1 convolutions are much faster, so even if this seems to be a more complex design, this block executes faster than the first resnet block we saw. This then lets us use more filters: as we see on the illustration, the number of filters in and out is 4 times higher (256) and the 1 by 1 convs are here to diminish then restore the number of channels (hence the name bottleneck). The overall impact is that we can use more filters in the same amount of time.\n", "\n", "Let's try replacing our ResBlock with this bottleneck design:" ] @@ -1174,6 +1192,13 @@ "The bottleneck design we've shown here is only used in ResNet50, 101, and 152 in all official models we've seen. ResNet18 and 34 use the non-bottleneck design seen in the previous section. However, we've noticed that the bottleneck layer generally works better even for the shallower networks. This just goes to show that the little details in papers tend to stick around for years, even if they're actually not quite the best design! Questioning assumptions and \"stuff everyone knows\" is always a good idea, because this is still a new field, and there's lots of details that aren't always done well." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TK add conclusion" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1243,31 +1268,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": true, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false } }, "nbformat": 4, diff --git a/16_arch_details.ipynb b/15_arch_details.ipynb similarity index 89% rename from 16_arch_details.ipynb rename to 15_arch_details.ipynb index 5a6dc7d..6a96b5b 100644 --- a/16_arch_details.ipynb +++ b/15_arch_details.ipynb @@ -28,7 +28,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We are now in the exciting position that we can fully understand the entire architectures that we have been using for our state-of-the-art models for computer vision, natural language processing, and tabular analysis. In this chapter, we're going to fill in all the missing details on how fastai's application models work." + "We are now in the exciting position that we can fully understand the entire architectures that we have been using for our state-of-the-art models for computer vision, natural language processing, and tabular analysis. 
In this chapter, we're going to fill in all the missing details on how fastai's application models work and show you how to build the models they use.\n", + "\n", + "We will also go back to the custom data preprocessing pipeline we saw in <> for Siamese networks and show you how you can use the components in the fastai library to build custom pretrained models for new tasks.\n", + "\n", + "We will go over each application in turn, starting with computer vision." ] }, { @@ -38,6 +42,13 @@ "## Computer vision" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In computer vision, we used the functions `cnn_learner` and `unet_learner` to build our models, depending on the task. Let's see how they start from a pretrained ResNet to build the `Learner` objects we have used in part 1 and 2 of this book." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -162,6 +173,13 @@ "> note: One parameter to create_head that is worth looking at is bn_final. Setting this to true will cause a batchnorm layer to be added as your final layer. This can be useful in helping your model to more easily ensure that it is scaled appropriately for your output activations. We haven't seen this approach published anywhere, as yet, but we have found that it works well in practice, wherever we have used it." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now have a look at what `unet_learner` did in the segmentation problem we showed in <>." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -181,7 +199,7 @@ "\n", "We will (naturally) do this with a neural network! So we need some kind of layer which can increase the grid size in a CNN. One very simple approach to this is to replace every pixel in the 7x7 grid with four pixels in a 2x2 square. Each of those four pixels would have the same value — this is known as nearest neighbour interpolation. 
PyTorch provides a layer which does this for us, so we could create a head which contains stride one convolutional layers (along with batchnorm and ReLU as usual) interspersed with 2x2 nearest neighbour interpolation layers. In fact, you could try this now! See if you can create a custom head designed like this, and see if it can complete the CamVid segmentation task. You should find that you get some reasonable results, although it won't be as good as our <> results.\n", "\n", - "Another approach is to replace the nearest neighbour and convolution combination with a *transposed convolution* otherwise known as a *stride half convolution*. This is identical to a regular convolution, but first zero padding is inserted between every pixel in the input. This is easiest to see with a picture — here's a diagram from the excellent convolutional arithmetic paper we have seen before, showing a 3x3 transposed convolution applied to a 3x3 image:" + "Another approach is to replace the nearest neighbour and convolution combination with a *transposed convolution* otherwise known as a *stride half convolution*. This is identical to a regular convolution, but first zero padding is inserted between every pixel in the input. This is easiest to see with a picture — <> shows a diagram from the excellent convolutional arithmetic paper we have seen before, showing a 3x3 transposed convolution applied to a 3x3 image." ] }, { @@ -199,7 +217,7 @@ "\n", "Neither of these approaches, however, works really well. The problem is that our 7x7 grid simply doesn't have enough information to create a 224x224 pixel output. It's asking an awful lot of the activations of each of those grid cells to have enough information to fully regenerate every pixel in the output. 
The solution to this problem is to use skip connections, like in a resnet, but skipping from the activations in the body of the resnet all the way over to the activations of the transposed convolution on the opposite side of the architecture. This is known as a U-Net, and it was developed in the 2015 paper [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597). Although the paper focussed on medical applications, the U-Net has revolutionized all kinds of generation vision models.\n", "\n", - "The U-Net paper shows the architecture like this:" + "<> shows the U-Net architecture (from the paper). " ] }, { @@ -218,6 +236,13 @@ "With this architecture, the input to the transposed convolutions is not just the lower resolution grid in the preceding layer, but also the higher resolution grid in the resnet head. This allows the U-Net to use all of the information of the original image, as it is needed. One challenge with U-Nets is that the exact architecture depends on the image size. fastai has a unique `DynamicUnet` class which auto-generates an architecture of the right size based on the data provided." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we've seen how to create complete state of the art computer vision models, let's move on to NLP." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -229,9 +254,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we've seen how to create complete state of the art computer vision models, let's move on to NLP.\n", - "\n", - "Converting an AWD-LSTM language model into a transfer learning classifier follows a very similar process to what we saw for `cnn_learner` in the first section of this chapter. We do not need a \"meta\" dictionary in this case, because we do not have such a variety of architectures to support in the body. 
All we need to do is to select the stacked RNN for the encoder in the language model, which is a single PyTorch module. This encoder will provide an activation for every word of the input, because a language model needs to output a prediction for every next word.\n", + "Converting an AWD-LSTM language model into a transfer learning classifier as we have done in <> follows a very similar process to what we saw for `cnn_learner` in the first section of this chapter. We do not need a \"meta\" dictionary in this case, because we do not have such a variety of architectures to support in the body. All we need to do is to select the stacked RNN for the encoder in the language model, which is a single PyTorch module. This encoder will provide an activation for every word of the input, because a language model needs to output a prediction for every next word.\n", "\n", "To create a classifier from this we use an approach described in the ULMFiT paper as \"BPTT for Text Classification (BPT3C)\". The paper describes this:" ] @@ -240,7 +263,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> In order to make fine-tuning a classifier for large documents feasible, we propose BPTT for Text Classification (BPT3C): We divide the document into fixed-length batches of size `b`. At the beginning of each batch, the model is initialized with the final state of the previous batch; we keep track of the hidden states for mean and max-pooling; gradients are back-propagated to the batches whose hidden states contributed to the final prediction. In practice, we use variable length backpropagation sequences." + "> : In order to make fine-tuning a classifier for large documents feasible, we propose BPTT for Text Classification (BPT3C): We divide the document into fixed-length batches of size `b`. 
At the beginning of each batch, the model is initialized with the final state of the previous batch; we keep track of the hidden states for mean and max-pooling; gradients are back-propagated to the batches whose hidden states contributed to the final prediction. In practice, we use variable length backpropagation sequences." ] }, { @@ -256,6 +279,13 @@ "This is done automatically behind the scenes by the fastai library when creating our `DataLoaders`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last application where we used fastai's model we haven't shown you yet is tabular." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -336,7 +366,9 @@ "\n", "```\n", "\n", - "Finally, this is passed through the linear layers (each of which includes batchnorm, if `use_bn` is True, and dropout, if `ps` is set to some value or list of values)." + "Finally, this is passed through the linear layers (each of which includes batchnorm, if `use_bn` is True, and dropout, if `ps` is set to some value or list of values).\n", + "\n", + "Congratulations! Now, you know every single piece of the architectures used in the fastai library!" ] }, { @@ -354,7 +386,7 @@ "\n", "Now that we have investigated all of the pieces of a model and the data that is passed into it, we can consider what this means for practical deep learning. If you have unlimited data, unlimited memory, and unlimited time, then the advice is easy: train a huge model on all of your data for a really long time. The reason that deep learning is not straightforward is because your data, memory, and time is limited. If you are running out of memory or time, then the solution is to train a smaller model. If you are not able to train for long enough to overfit, then you are not taking advantage of the capacity of your model.\n", "\n", - "So step one is to get to the point that you can overfit. Then, the question is how to reduce that overfitting. 
Here is how we recommend prioritising the steps from there:" + "So step one is to get to the point that you can overfit. Then, the question is how to reduce that overfitting. <> shows how we recommend prioritising the steps from there." ] }, { diff --git a/17_accel_sgd.ipynb b/16_accel_sgd.ipynb similarity index 86% rename from 17_accel_sgd.ipynb rename to 16_accel_sgd.ipynb index d0c1e09..7320cb3 100644 --- a/17_accel_sgd.ipynb +++ b/16_accel_sgd.ipynb @@ -23,22 +23,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Variants of SGD" + "# The training process" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now that you know all about how the architectures are put together, it's time to start exploring the training process.\n", + "Since we now know how to create state-of-the-art architectures for computer vision, natural language processing, tabular analysis, and collaborative filtering, and we know how to train them quickly, we're done, right? Not quite yet. We still have to explore the training process a little bit more.\n", "\n", - "We explained earlier the basis of Stochastic Gradient Descent: pass a minibatch in the model, compare it to our target with the loss function then compute the gradients of this loss function with regards to each weight before updating the weights with the formula:\n", + "We explained in <> the basis of Stochastic Gradient Descent: pass a minibatch in the model, compare it to our target with the loss function then compute the gradients of this loss function with regards to each weight before updating the weights with the formula:\n", "\n", "```python\n", "new_weight = weight - lr * weight.grad\n", "```\n", "\n", - "We implemented this from scratch in a training loop, and also saw that Pytorch provides a simple `nn.SGD` class that does this calculation for each parameter for us. Let's now build some faster optimizers, using a flexible foundation." 
+ "We implemented this from scratch in a training loop, and also saw that PyTorch provides a simple `optim.SGD` class that does this calculation for each parameter for us. In this chapter, we will build some faster optimizers, using a flexible foundation. But that's not all we might want to change in the training process. For any tweak of the training loop, we will need a way to add some code to the basis of SGD. The fastai library has a system of callbacks to do this, and we will teach you all about it.\n", + "\n", + "First things first, let's start with standard SGD to get a baseline, then we will introduce the most commonly used optimizers." ] }, { @@ -429,7 +431,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It's working! So that's how we create SGD from scratch in fastai." + "It's working! So that's how we create SGD from scratch in fastai. Now let's see what this momentum is exactly." ] }, { @@ -456,7 +458,7 @@ "\n", "Note that we are writing `weight.avg` to highlight the fact we need to store thoe moving averages for each parameter of the model (and they all their own independent moving averages).\n", "\n", - "Here is an example of noisy data for a single parameter, with the momentum curve plotted in red, and the gradients of the parameter plotted in blue. The gradients increase, and then decrease, and the momentum does a good job of following the general trend, without getting too influenced by noise:" + "<> shows an example of noisy data for a single parameter, with the momentum curve plotted in red, and the gradients of the parameter plotted in blue. The gradients increase, and then decrease, and the momentum does a good job of following the general trend, without getting too influenced by noise." 
] }, { @@ -480,6 +482,10 @@ } ], "source": [ + "#hide_input\n", + "#id img_mommentum\n", + "#caption An example of momentum\n", + "#alt Graph showing an example of momentum\n", "x = np.linspace(-4, 4, 100)\n", "y = 1 - (x/3) ** 2\n", "x1 = x + np.random.randn(100) * 0.1\n", @@ -499,7 +505,7 @@ "source": [ "It works particularly well if the loss function has narrow canyons we need to navigate: vanilla SGD would send us from one side to the other while SGD with momentum will average those to roll down inside. The parameter `beta` determines the strength of that momentum we are using: with a small beta we stay closer to the actual gradient values whereas with a high beta, we will mostly go in the direction of the average of the gradients and it will take a while before any change in the gradients makes that trend move.\n", "\n", - "With a large beta, we might miss that the gradients have changed directions and roll over a small local minima which is a desired side-effect: intuitively, when we show a new picture/text/data to our model, it will look like something in the training set but won't be exactly like it. That means it will correspond to a point in the loss function that is closest to the minimum we ended up with at the end of training, but not exactly *at* that minimum. We then would rather end up training in a wide minimum, where nearby points have approximately the same loss (or if you prefer, a point where the loss is as flat as possible). Here's how the above chart varies as we change beta:" + "With a large beta, we might miss that the gradients have changed directions and roll over a small local minima which is a desired side-effect: intuitively, when we show a new picture/text/data to our model, it will look like something in the training set but won't be exactly like it. That means it will correspond to a point in the loss function that is closest to the minimum we ended up with at the end of training, but not exactly *at* that minimum. 
We then would rather end up training in a wide minimum, where nearby points have approximately the same loss (or if you prefer, a point where the loss is as flat as possible). <> shows how the chart in <> varies as we change beta." ] }, { @@ -523,6 +529,10 @@ } ], "source": [ + "#hide_input\n", + "#id img_betas\n", + "#caption Momentum with different beta values\n", + "#alt Graph showing how the beta value influences momentum\n", "x = np.linspace(-4, 4, 100)\n", "y = 1 - (x/3) ** 2\n", "x1 = x + np.random.randn(100) * 0.1\n", @@ -852,7 +862,9 @@ "\n", "In fastai, Adam is the default optimizer we use since it allows faster training, but we found that `beta2=0.99` is better suited for the type of schedule we are using. `beta1` is the momentum parameter, which we specify with the argument `moms` in our call to `fit_one_cycle`. As for `eps`, fastai uses a default of 1e-5. `eps` is not just useful for numerical stability. A higher `eps` limits the maximum value of the adjusted learning rate. To take an extreme example, if `eps` is 1, then the adjusted learning will never be higher than the base learning rate. \n", "\n", - "Rather than show all the code for this in the book, we'll let you look at the optimizer notebook in fastai's GitHub repository--you'll see all the code we've seen so far, along with Adam and other optimizers, and lots of examples and tests." + "Rather than show all the code for this in the book, we'll let you look at the optimizer notebook in fastai's GitHub repository--you'll see all the code we've seen so far, along with Adam and other optimizers, and lots of examples and tests.\n", + "\n", + "One thing that changes when we go from SGD to Adam is the way we apply weight decay, and it can have important consequences."
] }, { @@ -889,7 +901,319 @@ "\n", "Most libraries use the first formulation, but it was pointed out in [Decoupled Weight Regularization](https://arxiv.org/pdf/1711.05101.pdf) by Ilya Loshchilov and Frank Hutter, second one is the only correct approach with the Adam optimizer or momentum, which is why fastai makes it its default.\n", "\n", - "Now you know everything that is hidden behind the line `learn.fit_one_cycle`!" + "Now you know everything that is hidden behind the line `learn.fit_one_cycle`!\n", + "\n", + "Optimizers are only one part of the training process. When you need to change the training loop with fastai, you can't directly change the code inside the library. Instead, we have designed a system of callbacks to let you write any tweak in independent blocks you can then mix and match. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Callbacks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes you need to change how things work a little bit. In fact, we have already seen examples of this: mixup, FP16 training, resetting the model after each epoch for training RNNs, and so forth. How do we go about making these kinds of tweaks to the training process?\n", + "\n", + "We've seen the basic training loop, which, with the help of the `Optimizer` class, looks like this for a single epoch:\n", + "\n", + "```python\n", + "for xb,yb in dl:\n", + " loss = loss_func(model(xb), yb)\n", + " loss.backward()\n", + " opt.step()\n", + " opt.zero_grad()\n", + "```\n", + "\n", + "<> shows how to picture that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Basic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The usual way for deep learning practitioners to customise the training loop is to make a copy of an existing training loop, and then insert the code necessary for their particular changes into it.
This is how nearly all code that you find online will look. But it has some very serious problems.\n", + "\n", + "It's not very likely that some particular tweaked training loop is going to meet your particular needs. There are hundreds of changes that can be made to a training loop, which means there are billions and billions of possible permutations. You can't just copy one tweak from a training loop here, another from a training loop there, and expect them all to work together. Each will be based on different assumptions about the environment that it's working in, use different naming conventions, and expect the data to be in different formats.\n", + "\n", + "We need a way to allow users to insert their own code at any part of the training loop, but in a consistent and well-defined way. Computer scientists have already come up with an answer to this question: the callback. A callback is a piece of code that you write, and inject into another piece of code at some predefined point. In fact, callbacks have been used with deep learning training loops for years. The problem is that only a small subset of places that may require code injection have been available in previous libraries, and, more importantly, callbacks were not able to do all the things they needed to do.\n", + "\n", + "In order to be just as flexible as manually copying and pasting a training loop and directly inserting code into it, a callback must be able to read every possible piece of information available in the training loop, modify all of it as needed, and fully control when a batch, epoch, or even the whole training loop should be terminated. fastai is the first library to provide all of this functionality. It modifies the training loop so it looks like <>."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The real test of whether this works has been borne out over the last couple of years — it has turned out that every single new paper implemented, or user request fulfilled, for modifying the training loop has successfully been achieved entirely by using the fastai callback system. The training loop itself has not required modifications. <> shows just a few of the callbacks that have been added." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Some" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The reason that this is important for all of us is that it means that whatever idea we have in our head, we can implement it. We need never dig into the source code of PyTorch or fastai and hack together some one-off system to try out our ideas. And when we do implement our own callbacks to develop our own ideas, we know that they will work together with all of the other functionality provided by fastai – so we will get progress bars, mixed precision training, hyperparameter annealing, and so forth.\n", + "\n", + "Another advantage is that it makes it easy to gradually remove or add functionality and perform ablation studies. You just need to adjust the list of callbacks you pass along to your fit function."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an example, here is the fastai source code that is run for each batch of the training loop:\n", + "\n", + "```python\n", + "try:\n", + " self._split(b); self('begin_batch')\n", + " self.pred = self.model(*self.xb); self('after_pred')\n", + " self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')\n", + " if not self.training: return\n", + " self.loss.backward(); self('after_backward')\n", + " self.opt.step(); self('after_step')\n", + " self.opt.zero_grad()\n", + "except CancelBatchException: self('after_cancel_batch')\n", + "finally: self('after_batch')\n", + "```\n", + "\n", + "The calls of the form `self('...')` are where the callbacks are called. As you see, after every step a callback is called. The callback will receive the entire state of training, and can also modify it. For instance, as you see above, the input data and target labels are in `self.xb` and `self.yb` respectively. A callback can modify these to modify the data the training loop sees. It can also modify `self.loss`, or even modify the gradients.\n", + "\n", + "Let's see how this works in practice by writing a `Callback`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating a callback" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you want to write your own callback, the full list of available events is:\n", + "\n", + "- `begin_fit`:: called before doing anything, ideal for initial setup.\n", + "- `begin_epoch`:: called at the beginning of each epoch, useful for any behavior you need to reset at each epoch.\n", + "- `begin_train`:: called at the beginning of the training part of an epoch.\n", + "- `begin_batch`:: called at the beginning of each batch, just after drawing said batch.
It can be used to do any setup necessary for the batch (like hyper-parameter scheduling) or to change the input/target before it goes in the model (change of the input with techniques like mixup for instance).\n", + "- `after_pred`:: called after computing the output of the model on the batch. It can be used to change that output before it's fed to the loss.\n", + "- `after_loss`:: called after the loss has been computed, but before the backward pass. It can be used to add any penalty to the loss (AR or TAR in RNN training for instance).\n", + "- `after_backward`:: called after the backward pass, but before the update of the parameters. It can be used to do any change to the gradients before said update (gradient clipping for instance).\n", + "- `after_step`:: called after the step and before the gradients are zeroed.\n", + "- `after_batch`:: called at the end of a batch, for any clean-up before the next one.\n", + "- `after_train`:: called at the end of the training phase of an epoch.\n", + "- `begin_validate`:: called at the beginning of the validation phase of an epoch, useful for any setup needed specifically for validation.\n", + "- `after_validate`:: called at the end of the validation part of an epoch.\n", + "- `after_epoch`:: called at the end of an epoch, for any clean-up before the next one.\n", + "- `after_fit`:: called at the end of training, for final clean-up.\n", + "\n", + "This list is available as attributes of the special variable `event`; so just type `event.` and hit `Tab` in your notebook to see a list of all the options" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at an example. Do you recall how in <> we needed to ensure that our special `reset` method was called at the start of training and validation for each epoch? We used the `ModelReseter` callback provided by fastai to do this for us. But how did `ModelReseter` do that exactly? 
Here's the full actual source code to that class:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ModelReseter(Callback):\n", + " def begin_train(self): self.model.reset()\n", + " def begin_validate(self): self.model.reset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yes, that's actually it! It just does what we said in the paragraph above: at the start of training and of validation for each epoch, call a method named `reset`.\n", + "\n", + "Callbacks are often \"short and sweet\" like this one. In fact, let's look at one more. Here's the fastai source for the callback that adds RNN regularization (*AR* and *TAR*):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RNNRegularizer(Callback):\n", + " def __init__(self, alpha=0., beta=0.): self.alpha,self.beta = alpha,beta\n", + "\n", + " def after_pred(self):\n", + " self.raw_out,self.out = self.pred[1],self.pred[2]\n", + " self.learn.pred = self.pred[0]\n", + "\n", + " def after_loss(self):\n", + " if not self.training: return\n", + " if self.alpha != 0.:\n", + " self.learn.loss += self.alpha * self.out[-1].float().pow(2).mean()\n", + " if self.beta != 0.:\n", + " h = self.raw_out[-1]\n", + " if len(h)>1:\n", + " self.learn.loss += self.beta * (h[:,1:] - h[:,:-1]\n", + " ).float().pow(2).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> stop: Go back to where we discussed TAR and AR regularization, and compare to the code here. Make sure you understand what it's doing, and why." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In both of these examples, notice how we can access attributes of the training loop by directly checking `self.model` or `self.pred`. That's because a `Callback` will always try to get an attribute it doesn't have inside the `Learner` associated to it.
This is a shortcut for `self.learn.model` or `self.learn.pred`. Note that this shortcut works for reading attributes, but not for writing them, which is why when `RNNRegularizer` changes the loss or the predictions, you see `self.learn.loss = ` or `self.learn.pred = `. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When writing a callback, the following attributes of `Learner` are available:\n", + "\n", + "- `model`: the model used for training/validation\n", + "- `data`: the underlying `DataLoaders`\n", + "- `loss_func`: the loss function used\n", + "- `opt`: the optimizer used to update the model parameters\n", + "- `opt_func`: the function used to create the optimizer\n", + "- `cbs`: the list containing all `Callback`s\n", + "- `dl`: current `DataLoader` used for iteration\n", + "- `x`/`xb`: last input drawn from `self.dl` (potentially modified by callbacks). `xb` is always a tuple (potentially with one element) and `x` is detuplified. You can only assign to `xb`.\n", + "- `y`/`yb`: last target drawn from `self.dl` (potentially modified by callbacks). `yb` is always a tuple (potentially with one element) and `y` is detuplified. You can only assign to `yb`.\n", + "- `pred`: last predictions from `self.model` (potentially modified by callbacks)\n", + "- `loss`: last computed loss (potentially modified by callbacks)\n", + "- `n_epoch`: the number of epochs in this training\n", + "- `n_iter`: the number of iterations in the current `self.dl`\n", + "- `epoch`: the current epoch index (from 0 to `n_epoch-1`)\n", + "- `iter`: the current iteration index in `self.dl` (from 0 to `n_iter-1`)\n", + "\n", + "The following attributes are added by `TrainEvalCallback` and should be available unless you went out of your way to remove that callback:\n", + "\n", + "- `train_iter`: the number of training iterations done since the beginning of this training\n", + "- `pct_train`: from 0.
to 1., the percentage of training iterations completed\n", + "- `training`: flag to indicate if we're in training mode or not\n", + "\n", + "The following attribute is added by `Recorder` and should be available unless you went out of your way to remove that callback:\n", + "\n", + "- `smooth_loss`: an exponentially-averaged version of the training loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Callbacks can also interrupt any part of the training loop by using a system of exceptions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Callback ordering and exceptions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, callbacks need to be able to tell fastai to skip over a batch, or an epoch, or stop training altogether. For instance, consider `TerminateOnNaNCallback`. This handy callback will automatically stop training any time the loss becomes infinite or `NaN` (*not a number*). Here's the fastai source for this callback:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TerminateOnNaNCallback(Callback):\n", + " run_before=Recorder\n", + " def after_batch(self):\n", + " if torch.isinf(self.loss) or torch.isnan(self.loss):\n", + " raise CancelFitException" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The way it tells the training loop to interrupt training at this point is to `raise CancelFitException`. The training loop catches this exception and does not run any further training or validation. 
The callback control flow exceptions available are:\n", + "\n", + "- `CancelBatchException`:: Skip the rest of this batch and go to `after_batch`\n", + "- `CancelTrainException`:: Skip the rest of the training part of the epoch and go to `after_train`\n", + "- `CancelValidException`:: Skip the rest of the validation part of the epoch and go to `after_validate`\n", + "- `CancelEpochException`:: Skip the rest of this epoch and go to `after_epoch`\n", + "- `CancelFitException`:: Interrupt training and go to `after_fit`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can detect that one of those exceptions has occurred and add code that executes right after with the following events:\n", + "\n", + "- `after_cancel_batch`:: reached immediately after a `CancelBatchException` before proceeding to `after_batch`\n", + "- `after_cancel_train`:: reached immediately after a `CancelTrainException` before proceeding to `after_epoch`\n", + "- `after_cancel_valid`:: reached immediately after a `CancelValidException` before proceeding to `after_epoch`\n", + "- `after_cancel_epoch`:: reached immediately after a `CancelEpochException` before proceeding to `after_epoch`\n", + "- `after_cancel_fit`:: reached immediately after a `CancelFitException` before proceeding to `after_fit`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, callbacks need to be called in a particular order. In the case of `TerminateOnNaNCallback`, it's important that `Recorder` runs its `after_batch` after this callback, to avoid registering an NaN loss. You can specify `run_before` (this callback must run before ...) or `run_after` (this callback must run after ...) in your callback to ensure the ordering that you need."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have seen how to tweak the training loop of fastai to do anything we need, let's take a step back and dig a little bit deeper into the foundations of that training loop." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TK Write a conclusion" ] }, { @@ -920,7 +1244,16 @@ "1. Calculate the value of `unbias_avg` and `w.avg` for a few batches of dummy values.\n", "1. What's the impact of having a high eps in Adam?\n", "1. Read through the optimizer notebook in fastai's repo, and execute it.\n", - "1. In what situations do dynamic learning rate methods like Adam change the behaviour of weight decay?" + "1. In what situations do dynamic learning rate methods like Adam change the behaviour of weight decay?\n", + "1. What are the four steps of a training loop?\n", + "1. Why is the use of callbacks better than writing a new training loop for each tweak you want to add?\n", + "1. What are the necessary points in the design of fastai's callback system that make it as flexible as copying and pasting bits of code?\n", + "1. How can you get the list of events available to you when writing a callback?\n", + "1. Write the `ModelReseter` callback (without peeking).\n", + "1. How can you access the necessary attributes of the training loop inside a callback? When can you use or not use the shortcut that goes with it?\n", + "1. How can a callback influence the control flow of the training loop?\n", + "1. Write the `TerminateOnNaNCallback` callback (without peeking if possible).\n", + "1. How do you make sure your callback runs after or before another callback?" ] }, { @@ -934,7 +1267,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "1. Look up the \"rectified Adam\" paper and implement it using the general optimizer framework, and try it out. Search for other recent optimizers that work well in practice, and pick one to implement." + "1.
Look up the \"rectified Adam\" paper and implement it using the general optimizer framework, and try it out. Search for other recent optimizers that work well in practice, and pick one to implement.\n", + "1. Look at the mixed precision callback with the documentation. Try to understand what each event and line of code does.\n", + "1. Implement your own version of the learning rate finder from scratch. Compare it with fastai's version.\n", + "1. Look at the source code of the callbacks that ship with fastai. See if you can find one that's similar to what you're looking to do, to get some inspiration." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Foundations of Deep Learning: Wrap up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations, you have made it to the end of the \"foundations of deep learning\" section. You now understand how all of fastai's applications and most important architectures are built, and the recommended ways to train them, and have all the information you need to build these from scratch. Whilst you probably won't need to create your own training loop, or batchnorm layer, for instance, knowing what is going on behind the scenes is very helpful for debugging, profiling, and deploying your solutions.\n", + "\n", + "Since you understand all of the foundations of fastai's applications now, be sure to spend some time digging through fastai's source notebooks, and running and experimenting with parts of them, since you can see exactly how everything in fastai is developed.\n", + "\n", + "In the next section, we will be looking even further under the covers, to see how the actual forward and backward passes of a neural network are done, and we will see what tools are at our disposal to get better performance.
We will then finish up with a project that brings together everything we have learned throughout the book, which we will use to build a method for interpreting convolutional neural networks." ] }, { diff --git a/19_foundations.ipynb b/17_foundations.ipynb similarity index 96% rename from 19_foundations.ipynb rename to 17_foundations.ipynb index 2695de5..fe2d1d8 100644 --- a/19_foundations.ipynb +++ b/17_foundations.ipynb @@ -39,7 +39,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## A neural net from scratch" + "## A neural net layer from scratch" ] }, { @@ -470,14 +470,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Broadcasting" + "### Broadcasting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As we discussed in <>, broadcasting is a term introduced by the numpy library that describes how tensor of different ranks are treated during arithmetic operations. For instance, it's obvious there is no way to add a 3 by 3 matrix with a 4 by 5 matrix, but what if we want to add one scalar (which can be represented as a 1 by 1 tensor) with a matrix? Or a vector of size 3 with a 3 by 4 matrix? In both cases, we can find a way to make sense of what the operation could be.\n", + "As we discussed in <>, broadcasting is a term introduced by the [numpy library](https://docs.scipy.org/doc/) that describes how tensor of different ranks are treated during arithmetic operations. For instance, it's obvious there is no way to add a 3 by 3 matrix with a 4 by 5 matrix, but what if we want to add one scalar (which can be represented as a 1 by 1 tensor) with a matrix? Or a vector of size 3 with a 3 by 4 matrix? In both cases, we can find a way to make sense of what the operation could be.\n", "\n", "Broadcasting gives specific rules to codify when shapes are compatible when trying to do an element-wise operation, and how the tensor of the smaller shape is expanded to match the tensor of the bigger shape. 
It's essential to master those rules if you want to be able to write code that executes quickly. In this section, we'll expand our previous treatment of broadcasting to understand these rules." ] @@ -486,14 +486,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Broadcasting with a scalar" + "#### Broadcasting with a scalar" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This is the easiest broadcating: when we have a tensor `a` and a scalar, we just imagine a tensor of the same shape as `a` filled with that scalar and perform the operation." + "Broadcasting with a scalar is the easiest type of broadcasting: when we have a tensor `a` and a scalar, we just imagine a tensor of the same shape as `a` filled with that scalar and perform the operation." ] }, { @@ -553,7 +553,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Broadcasting a vector to a matrix" + "Now you could have different means for each row of the matrix, in which case you would need to broadcast a vector to a matrix." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Broadcasting a vector to a matrix" ] }, { @@ -1027,14 +1034,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We're now 3,700 times faster than our first implementation!" + "We're now 3,700 times faster than our first implementation! Now let's discuss the exact rules of broadcasting." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Broadcasting Rules" + "#### Broadcasting Rules" ] }, { @@ -1077,7 +1084,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Einstein summation" + "Another useful tool for tensor manipulation is Einstein summation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Einstein summation" ] }, { @@ -1157,6 +1171,13 @@ "As we see, not only is it practical, but it's *very* fast.
`einsum` is often the fastest way to do custom operations in PyTorch, without diving into C++ and CUDA. (But it's generally not as fast as carefully optimized CUDA code, as you see in the matrix multiplication example)." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we know how to implement a matrix multiplication from scratch, we are ready to build our neural net, specifically its forward and backward passes, just using matrix multiplications." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1168,7 +1189,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have defined `matmul` from scratch, we are ready to define our first neural net. As we saw in <>, to train it, we will need to compute all the gradients of a given a loss with respect to its parameters, which is known as the *backward pass*. The *forward pass* is computing the output of the model on a given input, which is just based on the matrix products we saw. As we define our first neural net, we will also delve in the problem of properly initializing the weights, which is crucial to make training start properly." + "As we saw in <>, to train a model, we will need to compute all the gradients of a given loss with respect to its parameters, which is known as the *backward pass*. The *forward pass* is computing the output of the model on a given input, which is just based on the matrix products we saw. As we define our first neural net, we will also delve into the problem of properly initializing the weights, which is crucial to make training start properly." ] }, { @@ -1734,6 +1755,13 @@ "loss = mse(out, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That is all for the forward pass; let's now look at the gradients." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1932,6 +1960,13 @@ "And now we can access to the gradients of our model parameters in `w1.g`, `b1.g`, `w2.g`, `b2.g`."
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have successfully defined our model. Now let's make it a bit more like a PyTorch module." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2194,6 +2229,7 @@ "Then the structure used to build a more complex model that takes advantage of those functions is a `torch.nn.Module`. This is the base structure for all models and all the neural nets you have seen up until now where from that class. It mostly helps to register all the trainable parameters, which as we've seen can be used in the training loop.\n", "\n", "To implement a `nn.Module` you just need to\n", + "\n", "- Make sure the superclass `__init__` is called first when you initiliaze it,\n", "- Define any parameter of the model as attributes with `nn.Parameter`,\n", "- Define a `forward` function that returns the output of your model.\n", @@ -2314,6 +2350,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "TK tweak this and make it a real conclusion\n", + "\n", "- A neural net is basically a bunch of matrix multiplications with non-linearities in-between.\n", "- Python is slow so to write fast code we have to vectorize it and take advantage of element-wise arithmetic or broadcasting.\n", "- Two tensors are broadcastable if the dimensions starting from the end and going backward match (they are the same or one of them is 1). To make tensors broadcastable, we may need to add dimensions of size 1 with `unsqueeze` or a `None` index.\n", diff --git a/20_CAM.ipynb b/18_CAM.ipynb similarity index 99% rename from 20_CAM.ipynb rename to 18_CAM.ipynb index 82957c9..949dc12 100644 --- a/20_CAM.ipynb +++ b/18_CAM.ipynb @@ -30,7 +30,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we know how to build up pretty much anything from scratch, let's use that knowledge to create entirely new (and very useful!) functionality: the *class activation map*.
In the process, we'll learn about one handy feature of PyTorch we haven't seen before, the *hook*, and we'll apply many of the concepts classes we've learned in the rest of the book. If you want to really test out your understanding of the material in this book, after you've finished this chapter, try putting the book aside, and recreate the ideas here yourself from scratch (no peaking!)" + "Now that we know how to build up pretty much anything from scratch, let's use that knowledge to create entirely new (and very useful!) functionality: the *class activation map*. It gives us some insight into why a CNN made the predictions it did.\n", + "\n", + "In the process, we'll learn about one handy feature of PyTorch we haven't seen before, the *hook*, and we'll apply many of the concepts we've learned in the rest of the book. If you want to really test out your understanding of the material in this book, after you've finished this chapter, try putting the book aside, and recreate the ideas here yourself from scratch (no peeking!)" ] }, { @@ -44,7 +46,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Class Activation Mapping (or CAM) was introduced by Zhou et al. in [Learning Deep Features for Discriminative Localization](https://arxiv.org/abs/1512.04150). It uses the output of the last convolutional layer (just before our average pooling) together with the predictions to give us some heatmap visulaization of why the model made its decision.\n", + "Class Activation Mapping (or CAM) was introduced by Zhou et al. in [Learning Deep Features for Discriminative Localization](https://arxiv.org/abs/1512.04150). It uses the output of the last convolutional layer (just before our average pooling) together with the predictions to give us some heatmap visualization of why the model made its decision.
This is a useful tool for interpretation.\n", "\n", "More precisely, at each position of our final convolutional layer we have has many filters as the last linear layer. We can then compute the dot product of those activations by the final weights to have, for each location on our feature map, the score of the feature that was used to make a decision.\n", "\n", @@ -422,6 +424,13 @@ "fastai provides this `Hook` class for you, as well as some other handy classes to make working with hooks easier." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method is useful but only works for the last layer. Gradient CAM is a variant that addresses this problem." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -580,6 +589,13 @@ " interpolation='bilinear', cmap='magma');" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TK Write conclusion" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/18_callbacks.ipynb b/18_callbacks.ipynb deleted file mode 100644 index 28e06ed..0000000 --- a/18_callbacks.ipynb +++ /dev/null @@ -1,419 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#hide\n", - "from utils import *" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "[[chapter_callbacks]]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Callbacks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction to callbacks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since we now know how to create state-of-the-art architectures for computer vision, natural image processing, tabular analysis, and collaborative filtering, and we know how to train them quickly with accelerated optimisers, and we know how to regularise them effectively, we're done, right?\n", - "\n", - "Well… Yes, sort of. But other things come up. 
Sometimes you need to change how things work a little bit. In fact, we have already seen examples of this: mixup, FP16 training, resetting the model after each epoch for training RNNs, and so forth. How do we go about making these kinds of tweaks to the training process?\n", - "\n", - "We've seen the basic training loop, which, with the help of the `Optimizer` class, looks like this for a single epoch:\n", - "\n", - "```python\n", - "for xb,yb in dl:\n", - " loss = loss_func(model(xb), yb)\n", - " loss.backward()\n", - " opt.step()\n", - " opt.zero_grad()\n", - "```\n", - "\n", - "Here's one way to picture that:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Basic" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The usual way for deep learning practitioners to customise the training loop is to make a copy of an existing training loop, and then insert their code necessary for their particular changes into it. This is how nearly all code that you find online will look. But it has some very serious problems.\n", - "\n", - "It's not very likely that some particular tweaked training loop is going to meet your particular needs. There are hundreds of changes that can be made to a training loop, which means there are billions and billions of possible permutations. You can't just copy one tweak from a training loop here, another from a training loop there, and expect them all to work together. Each will be based on different assumptions about the environment that it's working in, use different naming conventions, and expect the data to be in different formats.\n", - "\n", - "We need a way to allow users to insert their own code at any part of the training loop, but in a consistent and well-defined way. Computer scientists have already come up with an answer to this question: the callback. A callback is a piece of code that you write, and inject into another piece of code at some predefined point. 
In fact, callbacks have been used with deep learning training loops for years. The problem is that only a small subset of places that may require code injection have been available in previous libraries, and, more importantly, callbacks were not able to do all the things they needed to do.\n", - "\n", - "In order to be just as flexible as manually copying and pasting a training loop and directly inserting code into it, a callback must be able to read every possible piece of information available in the training loop, modify all of it as needed, and fully control when a batch, epoch, or even all the whole training loop should be terminated. fastai is the first library to provide all of this functionality. It modifies the training loop so it looks like this:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The real test of whether this works has been borne out over the last couple of years — it has turned out that every single new paper implemented, or use a request fulfilled, for modifying the training loop has successfully been achieved entirely by using the fastai callback system. The training loop itself has not required modifications. Here are just a few of the callbacks that have been added:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"Some" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reason that this is important for all of us is that it means that whatever idea we have in our head, we can implement it. We need never dig into the source code of PyTorch or fastai and act together some one-off system to try out our ideas. 
And when we do implement our own callbacks to develop our own ideas, we know that they will work together with all of the other functionality provided by fastai – so we will get progress bars, mixed precision training, hyperparameter annealing, and so forth.\n", - "\n", - "Another advantage is that it makes it easy to gradually remove or add functionality and perform ablation studies. You just need to adjust the list of callbacks you pass along to your fit function." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As an example, here is the fastai source code that is run for each batch of the training loop:\n", - "\n", - "```python\n", - "try:\n", - " self._split(b); self('begin_batch')\n", - " self.pred = self.model(*self.xb); self('after_pred')\n", - " self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')\n", - " if not self.training: return\n", - " self.loss.backward(); self('after_backward')\n", - " self.opt.step(); self('after_step')\n", - " self.opt.zero_grad()\n", - "except CancelBatchException: self('after_cancel_batch')\n", - "finally: self('after_batch')\n", - "```\n", - "\n", - "The calls of the form `self('...')` are where the callbacks are called. As you see, after every step a callback is called. The callback will receive the entire state of training, and can also modify it. For instance, as you see above, the input data and target labels are in `self.xb` and `self.yb` respectively. A callback can modify these to modify the data the training loop sees. It can also modify `self.loss`, or even modify the gradients." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a callback" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The full list of available callback events is:\n", - "\n", - "- `begin_fit`: called before doing anything, ideal for initial setup.\n", - "- `begin_epoch`: called at the beginning of each epoch, useful for any behavior you need to reset at each epoch.\n", - "- `begin_train`: called at the beginning of the training part of an epoch.\n", - "- `begin_batch`: called at the beginning of each batch, just after drawing said batch. It can be used to do any setup necessary for the batch (like hyper-parameter scheduling) or to change the input/target before it goes in the model (change of the input with techniques like mixup for instance).\n", - "- `after_pred`: called after computing the output of the model on the batch. It can be used to change that output before it's fed to the loss.\n", - "- `after_loss`: called after the loss has been computed, but before the backward pass. It can be used to add any penalty to the loss (AR or TAR in RNN training for instance).\n", - "- `after_backward`: called after the backward pass, but before the update of the parameters. 
It can be used to do any change to the gradients before said update (gradient clipping for instance).\n", - "- `after_step`: called after the step and before the gradients are zeroed.\n", - "- `after_batch`: called at the end of a batch, for any clean-up before the next one.\n", - "- `after_train`: called at the end of the training phase of an epoch.\n", - "- `begin_validate`: called at the beginning of the validation phase of an epoch, useful for any setup needed specifically for validation.\n", - "- `after_validate`: called at the end of the validation part of an epoch.\n", - "- `after_epoch`: called at the end of an epoch, for any clean-up before the next one.\n", - "- `after_fit`: called at the end of training, for final clean-up.\n", - "\n", - "This list is available as attributes of the special variable `event`; so just type `event.` and hit `Tab` in your notebook to see a list of all the options" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at an example. Do you recall how in <> we needed to ensure that our special `reset` method was called at the start of training and validation for each epoch? We used the `ModelReseter` callback provided by fastai to do this for us. But how did `ModelReseter` do that exactly? Here's the full actual source code to that class:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class ModelReseter(Callback):\n", - " def begin_train(self): self.model.reset()\n", - " def begin_validate(self): self.model.reset()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Yes, that's actually it! It just does what we said in the paragraph above: after completing training and epoch or validation for an epoch, call a method named `reset`.\n", - "\n", - "Callbacks are often \"short and sweet\" like this one. In fact, let's look at one more. 
Here's the fastai source for the callback that add RNN regularization (*AR* and *TAR*):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class RNNRegularizer(Callback):\n", - " def __init__(self, alpha=0., beta=0.): self.alpha,self.beta = alpha,beta\n", - "\n", - " def after_pred(self):\n", - " self.raw_out,self.out = self.pred[1],self.pred[2]\n", - " self.learn.pred = self.pred[0]\n", - "\n", - " def after_loss(self):\n", - " if not self.training: return\n", - " if self.alpha != 0.:\n", - " self.learn.loss += self.alpha * self.out[-1].float().pow(2).mean()\n", - " if self.beta != 0.:\n", - " h = self.raw_out[-1]\n", - " if len(h)>1:\n", - " self.learn.loss += self.beta * (h[:,1:] - h[:,:-1]\n", - " ).float().pow(2).mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> stop: Go back to where we discussed TAR and AR regularization, and compare to the code here. Made sure you understand what it's doing, and why." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In both of these examples, notice how we can access attributes of the training loop by directly checking `self.model` or `self.pred`. That's because a `Callback` will always try to get an attribute it doesn't have inside the `Learner` associated to it. This is a shortcut for `self.learn.model` or `self.learn.pred`. Note that this shortcut works for reading attributes, but not for writing them, which is why when `RNNRegularizer` changes the loss or the predictions, you see `self.learn.loss = ` or `self.learn.pred = `. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When writing a callback, the following attributes of `Learner` are available:\n", - "\n", - "- `model`: the model used for training/validation\n", - "- `data`: the underlying `DataLoaders`\n", - "- `loss_func`: the loss function used\n", - "- `opt`: the optimizer used to udpate the model parameters\n", - "- `opt_func`: the function used to create the optimizer\n", - "- `cbs`: the list containing all `Callback`s\n", - "- `dl`: current `DataLoader` used for iteration\n", - "- `x`/`xb`: last input drawn from `self.dl` (potentially modified by callbacks). `xb` is always a tuple (potentially with one element) and `x` is detuplified. You can only assign to `xb`.\n", - "- `y`/`yb`: last target drawn from `self.dl` (potentially modified by callbacks). `yb` is always a tuple (potentially with one element) and `y` is detuplified. You can only assign to `yb`.\n", - "- `pred`: last predictions from `self.model` (potentially modified by callbacks)\n", - "- `loss`: last computed loss (potentially modified by callbacks)\n", - "- `n_epoch`: the number of epochs in this training\n", - "- `n_iter`: the number of iterations in the current `self.dl`\n", - "- `epoch`: the current epoch index (from 0 to `n_epoch-1`)\n", - "- `iter`: the current iteration index in `self.dl` (from 0 to `n_iter-1`)\n", - "\n", - "The following attributes are added by `TrainEvalCallback` and should be available unless you went out of your way to remove that callback:\n", - "\n", - "- `train_iter`: the number of training iterations done since the beginning of this training\n", - "- `pct_train`: from 0. 
to 1., the percentage of training iterations completed\n", - "- `training`: flag to indicate if we're in training mode or not\n", - "\n", - "The following attribute is added by `Recorder` and should be available unless you went out of your way to remove that callback:\n", - "\n", - "- `smooth_loss`: an exponentially-averaged version of the training loss" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Callback ordering and exceptions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sometimes, callbacks need to be able to tell fastai to skip over a batch, or an epoch, or stop training altogether. For instance, consider `TerminateOnNaNCallback`. This handy callback will automatically stop training any time the loss becomes infinite or `NaN` (*not a number*). Here's the fastai source for this callback:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class TerminateOnNaNCallback(Callback):\n", - " run_before=Recorder\n", - " def after_batch(self):\n", - " if torch.isinf(self.loss) or torch.isnan(self.loss):\n", - " raise CancelFitException" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The way it tells the training loop to interrupt training at this point is to `raise CancelFitException`. The training loop catches this exception and does not run any further training or validation. 
The callback control flow exceptions available are:\n", - "\n", - "- `CancelFitException`: Skip the rest of this batch and go to `after_batch\n", - "- `CancelEpochException`: Skip the rest of the training part of the epoch and go to `after_train\n", - "- `CancelTrainException`: Skip the rest of the validation part of the epoch and go to `after_validate\n", - "- `CancelValidException`: Skip the rest of this epoch and go to `after_epoch\n", - "- `CancelBatchException`: Interrupts training and go to `after_fit" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can detect one of those exceptions occurred and add code that executes right after with the following events:\n", - "\n", - "- `after_cancel_batch`: reached immediately after a `CancelBatchException` before proceeding to `after_batch`\n", - "- `after_cancel_train`: reached immediately after a `CancelTrainException` before proceeding to `after_epoch`\n", - "- `after_cancel_valid`: reached immediately after a `CancelValidException` before proceeding to `after_epoch`\n", - "- `after_cancel_epoch`: reached immediately after a `CancelEpochException` before proceeding to `after_epoch`\n", - "- `after_cancel_fit`: reached immediately after a `CancelFitException` before proceeding to `after_fit`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sometimes, callbacks need to be called in a particular order. In the case of `TerminateOnNaNCallback`, it's important that `Recorder` runs its `after_batch` after this callback, to avoid registering an NaN loss. You can specify `run_before` (this callback must run before ...) or `run_after` (this callback must run after ...) in your callback to ensure the ordering that you need.\n", - "\n", - "Now that we have seen how to tweak the training loop of fastai to do anything we need, let's take a step back and dig a little bit deeper in the foundations of that training loop." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Questionnaire" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. What are the four steps of a training loop?\n", - "1. Why is the use of callbacks better than writing a new training loop for each tweak you want to add?\n", - "1. What are the necessary points in the design of the fastai's callback system that make it as flexible as copying and pasting bits of code?\n", - "1. How can you get the list of events available to you when writing a callback?\n", - "1. Write the `ModelResetter` callback (without peeking).\n", - "1. How can you access the necessary attributes of the training loop inside a callback? When can you use or not use the shortcut that goes with it?\n", - "1. How can a callback influence the control flow of the training loop.\n", - "1. Write the `TerminateOnNaN` callback (without peeking if possible).\n", - "1. How do you make sure your callback runs after or before another callback?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Further research" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. Look at the mixed precision callback with the documentation. Try to understand what each event and line of code does.\n", - "1. Implement your own version of ther learning rate finder from scratch. Compare it with fastai's version.\n", - "1. Look at the source code of the callbacks that ship with fastai. See if you can find one that's similar to what you're looking to do, to get some inspiration." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Foundations of Deep Learning: Wrap up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congratulations, you have made it to the end of the \"foundations of deep learning\" section. 
You now understand how all of fastai's applications and most important architectures are built, and the recommended ways to train them, and have all the information you need to build these from scratch. Whilst you probably won't need to create your own training loop, or batchnorm layer, for instance, knowing what is going on behind the scenes is very helpful for debugging, profiling, and deploying your solutions.\n", - "\n", - "Since you understand all of the foundations of fastai's applications now, be sure to spend some time digging through fastai's source notebooks, and running and experimenting with parts of them, since you can and see exactly how everything in fastai is developed.\n", - "\n", - "In the next section, we will be looking even further under the covers, to see how the actual forward and backward passes of a neural network are done, and we will see what tools are at our disposal to get better performance. We will then finish up with a project that brings together everything we have learned throughout the book, which we will use to build a method for interpreting convolutional neural networks." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "split_at_heading": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": true, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/21_learner.ipynb b/19_learner.ipynb similarity index 98% rename from 21_learner.ipynb rename to 19_learner.ipynb index 81054d7..171d96b 100644 --- a/21_learner.ipynb +++ b/19_learner.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ "Path('/home/jhoward/.fastai/data/imagenette2-160/val/n03417042/n03417042_3752.JPEG')" ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -74,7 +74,7 @@ "Path('/home/jhoward/.fastai/data/imagenette2-160/val/n03417042/n03417042_3752.JPEG')" ] }, - "execution_count": 4, + "execution_count": null, 
"metadata": {}, "output_type": "execute_result" } @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -97,7 +97,7 @@ "" ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -118,7 +118,7 @@ "(#10) ['n03417042','n03445777','n03888257','n03394916','n02979186','n03000684','n03425413','n01440764','n03028079','n02102040']" ] }, - "execution_count": 6, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -129,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -147,7 +147,7 @@ " 'n02102040': 9}" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -167,7 +167,7 @@ "torch.Size([160, 213, 3])" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -186,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -201,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -210,7 +210,7 @@ "(9469, 3925)" ] }, - "execution_count": 10, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -223,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -232,7 +232,7 @@ "(torch.Size([64, 64, 3]), tensor(0))" ] }, - "execution_count": 11, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -245,7 +245,7 @@ }, { "cell_type": "code", - 
"execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -267,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -287,7 +287,7 @@ "(torch.Size([2, 64, 64, 3]), tensor([0, 0]))" ] }, - "execution_count": 14, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -299,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -319,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -328,7 +328,7 @@ "(torch.Size([128, 64, 64, 3]), torch.Size([128]), 74)" ] }, - "execution_count": 16, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -343,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -352,7 +352,7 @@ "[tensor([0.4544, 0.4453, 0.4141]), tensor([0.2812, 0.2766, 0.2981])]" ] }, - "execution_count": 17, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -364,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -378,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -388,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -397,7 +397,7 @@ "(tensor([0.3732, 0.4907, 0.5633]), tensor([1.0212, 1.0311, 1.0131]))" ] }, - "execution_count": 20, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -416,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 23, + 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -466,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -475,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -496,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -505,7 +505,7 @@ "2" ] }, - "execution_count": 26, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -526,7 +526,7 @@ "torch.Size([128, 4, 64, 64])" ] }, - "execution_count": 27, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -539,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -555,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -564,7 +564,7 @@ "torch.Size([3, 2])" ] }, - "execution_count": 29, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -577,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -598,7 +598,7 @@ "4" ] }, - "execution_count": 31, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -610,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -619,7 
+619,7 @@ "device(type='cuda', index=5)" ] }, - "execution_count": 32, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -655,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -665,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -682,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -691,7 +691,7 @@ "10" ] }, - "execution_count": 36, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -703,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -713,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -732,7 +732,7 @@ "torch.Size([128, 10])" ] }, - "execution_count": 38, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -751,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -767,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -776,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -785,7 +785,7 @@ "tensor(-2.7753, grad_fn=)" ] }, - "execution_count": 41, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -796,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -805,7 +805,7 @@ "tensor(2.5293, grad_fn=)" ] }, 
- "execution_count": 42, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -828,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -837,7 +837,7 @@ "tensor(-2.7753, grad_fn=)" ] }, - "execution_count": 43, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -863,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -872,7 +872,7 @@ "tensor(False)" ] }, - "execution_count": 44, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -885,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -896,7 +896,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -905,7 +905,7 @@ "tensor(2.3158, grad_fn=)" ] }, - "execution_count": 46, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -923,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -932,7 +932,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -941,7 +941,7 @@ "tensor(-2.7753, grad_fn=)" ] }, - "execution_count": 48, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -952,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -968,7 +968,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -982,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -992,7 +992,7 @@ }, { "cell_type": "code", - "execution_count": 
52, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1001,7 +1001,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1045,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1054,7 +1054,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1077,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1091,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1100,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1109,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1160,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1182,7 +1182,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1191,7 +1191,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1200,7 +1200,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1273,7 +1273,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1296,7 +1296,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1323,7 +1323,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ @@ -1332,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1341,7 +1341,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1623,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1731,34 +1731,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - }, - "toc": { - "base_numbering": 1, - "nav_menu": { - "height": "140px", - "width": "202px" - }, - "number_sections": false, - "sideBar": true, - "skip_h1_title": true, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false } }, "nbformat": 4, diff --git a/22_conclusion.ipynb b/20_conclusion.ipynb similarity index 95% rename from 22_conclusion.ipynb rename to 20_conclusion.ipynb index 32efd3e..1ad376e 100644 --- a/22_conclusion.ipynb +++ b/20_conclusion.ipynb @@ -20,7 +20,7 @@ "source": [ "Congratulations! You've made it! If you have worked through all of the notebooks to this point, then you have joined a small, but growing group of people that are able to harness the power of deep learning to solve real problems. You may not feel that way; in fact you probably do not feel that way. We have seen again and again that students that complete the fast.AI courses dramatically underestimate how effective they are as deep learning practitioners. We've also seen that these people are often underestimated by those that have come out of a classic academic background. 
So for you to rise above your own expectations and the expectations of others what you do next, after closing this book, is even more important than what you've done to get to this point.\n", "\n", - "The most important thing is to keep the momentum going. In fact, as you know from your study of optimisers, momentum is something which can build upon itself! So think about what it is you can do now to maintain and accelerate your deep learning journey. Here's a few ideas:" + "The most important thing is to keep the momentum going. In fact, as you know from your study of optimisers, momentum is something which can build upon itself! So think about what it is you can do now to maintain and accelerate your deep learning journey. <> can give you a few ideas." ] }, { @@ -69,18 +69,6 @@ "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" } }, "nbformat": 4,