diff --git a/.coveragerc b/.coveragerc index 3e8f879f798..77faedc8971 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,6 +1,16 @@ [report] -fail_under = 84 +# Regexes for lines to exclude from consideration +exclude_lines = + # Don't complain if tests don't hit defensive assertion code: + raise ImportError + raise NotImplementedError + + # Don't complain if legacy support codes are not performed: + if original_keras_version == '1': + +fail_under = 85 show_missing = True omit = keras/applications/* keras/datasets/* + keras/legacy/* diff --git a/.travis.yml b/.travis.yml index 6ec77e076d3..d52eb878d67 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,10 +38,10 @@ install: # Useful for debugging any issues with conda - conda info -a - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy nose scipy matplotlib pandas pytest h5py + - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pytest pandas - source activate test-environment + - pip install --only-binary=numpy,scipy numpy nose scipy matplotlib h5py theano - conda install mkl mkl-service - - pip install theano # set library path - export LD_LIBRARY_PATH=$HOME/miniconda/envs/test-environment/lib/:$LD_LIBRARY_PATH @@ -111,5 +111,5 @@ script: elif [[ "$TEST_MODE" == "DOC" ]]; then PYTHONPATH=$PWD:$PYTHONPATH py.test tests/test_documentation.py; else - PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --cov-config .coveragerc --cov=keras tests/; + PYTHONPATH=$PWD:$PYTHONPATH py.test tests/ --ignore=tests/integration_tests --ignore=tests/test_documentation.py --ignore=tests/keras/legacy/layers_test.py --cov-config .coveragerc --cov=keras tests/; fi diff --git a/LICENSE b/LICENSE index 1790e873698..aa8e4be0952 100644 --- a/LICENSE +++ b/LICENSE @@ -1,19 +1,19 @@ COPYRIGHT All contributions by François Chollet: -Copyright (c) 2015, François Chollet. +Copyright (c) 2015 - 2018, François Chollet. All rights reserved. All contributions by Google: -Copyright (c) 2015, Google, Inc. +Copyright (c) 2015 - 2018, Google, Inc. All rights reserved. All contributions by Microsoft: -Copyright (c) 2017, Microsoft, Inc. +Copyright (c) 2017 - 2018, Microsoft, Inc. All rights reserved. All other contributions: -Copyright (c) 2015 - 2017, the respective contributors. +Copyright (c) 2015 - 2018, the respective contributors. All rights reserved. Each contributor holds copyright over their respective contributions. diff --git a/README.md b/README.md index 234ad7d15dc..6396f8b83d0 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ sudo python setup.py install ------------------ -## Switching from TensorFlow to CNTK or Theano +## Using a different backend than TensorFlow By default, Keras will use TensorFlow as its tensor manipulation library. [Follow these instructions](https://keras.io/backend/) to configure the Keras backend. 
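For reference, the README hunk above points readers to the backend configuration docs. A minimal sketch of the `~/.keras/keras.json` file that selects the backend (the values shown here are assumed to be the usual defaults; the `KERAS_BACKEND` environment variable can also override the `backend` entry):

```json
{
    "image_data_format": "channels_last",
    "epsilon": 1e-07,
    "floatx": "float32",
    "backend": "tensorflow"
}
```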
diff --git a/docker/Dockerfile b/docker/Dockerfile index 5debe26e5c0..b89b1396bbd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,5 +1,5 @@ -ARG cuda_version=8.0 -ARG cudnn_version=6 +ARG cuda_version=9.0 +ARG cudnn_version=7 FROM nvidia/cuda:${cuda_version}-cudnn${cudnn_version}-devel ENV CONDA_DIR /opt/conda diff --git a/docker/Makefile b/docker/Makefile index 42c398807d9..180961b8b42 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -7,8 +7,8 @@ DOCKER_FILE=Dockerfile DOCKER=GPU=$(GPU) nvidia-docker BACKEND=tensorflow PYTHON_VERSION?=3.6 -CUDA_VERSION?=8.0 -CUDNN_VERSION?=6 +CUDA_VERSION?=9.0 +CUDNN_VERSION?=7 TEST=tests/ SRC?=$(shell dirname `pwd`) diff --git a/docs/autogen.py b/docs/autogen.py index 3919a5359b9..2f02c21a8c2 100644 --- a/docs/autogen.py +++ b/docs/autogen.py @@ -489,13 +489,18 @@ def process_docstring(docstring): new_fpath = fpath.replace('templates', 'sources') shutil.copy(fpath, new_fpath) + # Take care of index page. -readme = open('../README.md').read() -index = open('templates/index.md').read() +def read_file(path): + with open(path) as f: + return f.read() + + +readme = read_file('../README.md') +index = read_file('templates/index.md') index = index.replace('{{autogenerated}}', readme[readme.find('##'):]) -f = open('sources/index.md', 'w') -f.write(index) -f.close() +with open('sources/index.md', 'w') as f: + f.write(index) print('Starting autogeneration.') for page_data in PAGES: @@ -564,7 +569,7 @@ def process_docstring(docstring): page_name = page_data['page'] path = os.path.join('sources', page_name) if os.path.exists(path): - template = open(path).read() + template = read_file(path) assert '{{autogenerated}}' in template, ('Template found for ' + path + ' but missing {{autogenerated}} tag.') mkdown = template.replace('{{autogenerated}}', mkdown) @@ -574,6 +579,7 @@ def process_docstring(docstring): subdir = os.path.dirname(path) if not os.path.exists(subdir): os.makedirs(subdir) - open(path, 'w').write(mkdown) + with open(path, 'w') as f: + f.write(mkdown) shutil.copyfile('../CONTRIBUTING.md', 'sources/contributing.md') diff --git a/docs/templates/getting-started/functional-api-guide.md b/docs/templates/getting-started/functional-api-guide.md index 8ce256b7520..3f35d66f896 100644 --- a/docs/templates/getting-started/functional-api-guide.md +++ b/docs/templates/getting-started/functional-api-guide.md @@ -168,15 +168,15 @@ One way to achieve this is to build a model that encodes two tweets into two vec Because the problem is symmetric, the mechanism that encodes the first tweet should be reused (weights and all) to encode the second tweet. Here we use a shared LSTM layer to encode the tweets. -Let's build this with the functional API. We will take as input for a tweet a binary matrix of shape `(140, 256)`, i.e. a sequence of 140 vectors of size 256, where each dimension in the 256-dimensional vector encodes the presence/absence of a character (out of an alphabet of 256 frequent characters). +Let's build this with the functional API. We will take as input for a tweet a binary matrix of shape `(280, 256)`, i.e. a sequence of 280 vectors of size 256, where each dimension in the 256-dimensional vector encodes the presence/absence of a character (out of an alphabet of 256 frequent characters). 
```python import keras from keras.layers import Input, LSTM, Dense from keras.models import Model -tweet_a = Input(shape=(140, 256)) -tweet_b = Input(shape=(140, 256)) +tweet_a = Input(shape=(280, 256)) +tweet_b = Input(shape=(280, 256)) ``` To share a layer across different inputs, simply instantiate the layer once, then call it on as many inputs as you want: @@ -222,7 +222,7 @@ In previous versions of Keras, you could obtain the output tensor of a layer ins As long as a layer is only connected to one input, there is no confusion, and `.output` will return the one output of the layer: ```python -a = Input(shape=(140, 256)) +a = Input(shape=(280, 256)) lstm = LSTM(32) encoded_a = lstm(a) @@ -232,8 +232,8 @@ assert lstm.output == encoded_a Not so if the layer has multiple inputs: ```python -a = Input(shape=(140, 256)) -b = Input(shape=(140, 256)) +a = Input(shape=(280, 256)) +b = Input(shape=(280, 256)) lstm = LSTM(32) encoded_a = lstm(a) diff --git a/docs/templates/preprocessing/image.md b/docs/templates/preprocessing/image.md index c8cd70afd58..2de15d67954 100644 --- a/docs/templates/preprocessing/image.md +++ b/docs/templates/preprocessing/image.md @@ -82,7 +82,7 @@ Generate batches of tensor image data with real-time data augmentation. The data - __batch_size__: int (default: 32). - __shuffle__: boolean (default: True). - __seed__: int (default: None). - - __save_to_dir__: None or str (default: None). This allows you to optimally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing). + - __save_to_dir__: None or str (default: None). This allows you to optionally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing). - __save_prefix__: str (default: `''`). Prefix to use for filenames of saved pictures (only relevant if `save_to_dir` is set). - __save_format__: one of "png", "jpeg" (only relevant if `save_to_dir` is set). Default: "png". - __yields__: Tuples of `(x, y)` where `x` is a numpy array of image data and `y` is a numpy array of corresponding labels. @@ -90,7 +90,7 @@ Generate batches of tensor image data with real-time data augmentation. The data - __flow_from_directory(directory)__: Takes the path to a directory, and generates batches of augmented/normalized data. Yields batches indefinitely, in an infinite loop. - __Arguments__: - __directory__: path to the target directory. It should contain one subdirectory per class. - Any PNG, JPG, BMP or PPM images inside each of the subdirectories directory tree will be included in the generator. + Any PNG, JPG, BMP, PPM or TIF images inside each of the subdirectories directory tree will be included in the generator. See [this script](https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details. - __target_size__: tuple of integers `(height, width)`, default: `(256, 256)`. The dimensions to which all images found will be resized. @@ -100,7 +100,7 @@ Generate batches of tensor image data with real-time data augmentation. The data - __batch_size__: size of the batches of data (default: 32). - __shuffle__: whether to shuffle the data (default: True) - __seed__: optional random seed for shuffling and transformations. - - __save_to_dir__: None or str (default: None). This allows you to optimally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing). + - __save_to_dir__: None or str (default: None). 
This allows you to optionally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing). - __save_prefix__: str. Prefix to use for filenames of saved pictures (only relevant if `save_to_dir` is set). - __save_format__: one of "png", "jpeg" (only relevant if `save_to_dir` is set). Default: "png". - __follow_links__: whether to follow symlinks inside class subdirectories (default: False). diff --git a/docs/templates/preprocessing/text.md b/docs/templates/preprocessing/text.md index 08c65619451..d8674bc8ce5 100644 --- a/docs/templates/preprocessing/text.md +++ b/docs/templates/preprocessing/text.md @@ -81,7 +81,8 @@ keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ", - char_level=False) + char_level=False, + oov_token=None) ``` Class for vectorizing texts, or/and turning texts into sequences (=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i). @@ -89,6 +90,7 @@ Class for vectorizing texts, or/and turning texts into sequences (=list of word - __Arguments__: Same as `text_to_word_sequence` above. - __num_words__: None or int. Maximum number of words to work with (if set, tokenization will be restricted to the top num_words most common words in the dataset). - __char_level__: if True, every character will be treated as a token. + - __oov_token__: None or str. If given, it will be added to word_index and used to replace out-of-vocabulary words during text_to_sequence calls. - __Methods__: diff --git a/docs/templates/why-use-keras.md b/docs/templates/why-use-keras.md index d41d244f44e..41423a0f950 100644 --- a/docs/templates/why-use-keras.md +++ b/docs/templates/why-use-keras.md @@ -34,7 +34,7 @@ Your Keras models can be easily deployed across a greater range of platforms tha - On Android, via the TensorFlow Android runtime. 
Example: [Not Hotdog app](https://medium.com/@timanglade/how-hbos-silicon-valley-built-not-hotdog-with-mobile-tensorflow-keras-react-native-ef03260747f3) - In the browser, via GPU-accelerated JavaScript runtimes such as [Keras.js](https://transcranial.github.io/keras-js/#/) and [WebDNN](https://mil-tokyo.github.io/webdnn/) - On Google Cloud, via [TensorFlow-Serving](https://www.tensorflow.org/serving/) -- In a Python webapp backend (such as a Flask app) +- [In a Python webapp backend (such as a Flask app)](https://blog.keras.io/building-a-simple-keras-deep-learning-rest-api.html) - On the JVM, via [DL4J model import provided by SkyMind](https://deeplearning4j.org/model-import-keras) - On Raspberry Pi @@ -54,7 +54,7 @@ As such, your Keras model can be trained on a number of different hardware platf - [NVIDIA GPUs](https://developer.nvidia.com/deep-learning) - [Google TPUs](https://cloud.google.com/tpu/), via the TensorFlow backend and Google Cloud -- OpenGL-enabled GPUs, such as those from AMD, via [the PlaidML Keras backend](https://github.com/plaidml/plaidml) +- OpenCL-enabled GPUs, such as those from AMD, via [the PlaidML Keras backend](https://github.com/plaidml/plaidml) --- diff --git a/examples/babi_memnn.py b/examples/babi_memnn.py index cbbccba2eba..c7c5e5da614 100644 --- a/examples/babi_memnn.py +++ b/examples/babi_memnn.py @@ -100,7 +100,7 @@ def vectorize_stories(data): '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n' '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz') raise -tar = tarfile.open(path) + challenges = { # QA1 with 10,000 samples @@ -112,8 +112,9 @@ def vectorize_stories(data): challenge = challenges[challenge_type] print('Extracting stories for the challenge:', challenge_type) -train_stories = get_stories(tar.extractfile(challenge.format('train'))) -test_stories = get_stories(tar.extractfile(challenge.format('test'))) +with tarfile.open(path) as tar: + train_stories = get_stories(tar.extractfile(challenge.format('train'))) + test_stories = get_stories(tar.extractfile(challenge.format('test'))) vocab = set() for story, q, answer in train_stories + test_stories: diff --git a/examples/babi_rnn.py b/examples/babi_rnn.py index 515c6ebd70c..8b2afccf4e6 100644 --- a/examples/babi_rnn.py +++ b/examples/babi_rnn.py @@ -160,7 +160,7 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen): '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n' '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz') raise -tar = tarfile.open(path) + # Default QA1 with 1000 samples # challenge = 'tasks_1-20_v1-2/en/qa1_single-supporting-fact_{}.txt' # QA1 with 10,000 samples @@ -169,8 +169,9 @@ def vectorize_stories(data, word_idx, story_maxlen, query_maxlen): challenge = 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt' # QA2 with 10,000 samples # challenge = 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt' -train = get_stories(tar.extractfile(challenge.format('train'))) -test = get_stories(tar.extractfile(challenge.format('test'))) +with tarfile.open(path) as tar: + train = get_stories(tar.extractfile(challenge.format('train'))) + test = get_stories(tar.extractfile(challenge.format('test'))) vocab = set() for story, q, answer in train + test: diff --git a/examples/cifar10_cnn_capsule.py b/examples/cifar10_cnn_capsule.py new file mode 100644 index 00000000000..b9f09b3fb1b --- /dev/null +++ b/examples/cifar10_cnn_capsule.py @@ -0,0 +1,222 @@ +"""Train a simple 
CNN-Capsule Network on the CIFAR10 small images dataset.
+
+Without Data Augmentation:
+It gets to 75% validation accuracy in 10 epochs,
+and 79% after 15 epochs, and overfitting after 20 epochs.
+
+With Data Augmentation:
+It gets to 75% validation accuracy in 10 epochs,
+and 79% after 15 epochs, and 83% after 30 epochs.
+In my test, highest validation accuracy is 83.79% after 50 epochs.
+
+This is a fast implementation, just 20s/epoch on a GTX 1070 GPU.
+"""
+
+from __future__ import print_function
+from keras import backend as K
+from keras.engine.topology import Layer
+from keras import activations
+from keras import utils
+from keras.datasets import cifar10
+from keras.models import Model
+from keras.layers import *
+from keras.preprocessing.image import ImageDataGenerator
+
+
+# the squashing function.
+# we use 0.5 instead of the 1 used in Hinton's paper.
+# if 1, the norm of the vector will be zoomed out.
+# if 0.5, the norm will be zoomed in while the original norm is less than 0.5
+# and zoomed out while the original norm is greater than 0.5.
+def squash(x, axis=-1):
+    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
+    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
+    return scale * x
+
+
+# define our own softmax function instead of K.softmax
+# because K.softmax cannot specify an axis.
+def softmax(x, axis=-1):
+    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
+    return ex / K.sum(ex, axis=axis, keepdims=True)
+
+
+# define the margin loss, similar to a hinge loss
+def margin_loss(y_true, y_pred):
+    lamb, margin = 0.5, 0.1
+    return y_true * K.square(K.relu(1 - margin - y_pred)) + lamb * (
+        1 - y_true) * K.square(K.relu(y_pred - margin))
+
+
+class Capsule(Layer):
+    """A Capsule layer implemented in pure Keras.
+    There are two versions of Capsule.
+    One is like a Dense layer (for fixed-shape input),
+    and the other is like a TimeDistributed Dense layer (for variable-length input).
+
+    The input shape of Capsule must be (batch_size,
+                                        input_num_capsule,
+                                        input_dim_capsule
+                                        )
+    and the output shape is (batch_size,
+                             num_capsule,
+                             dim_capsule
+                             )
+
+    The Capsule implementation is from https://github.com/bojone/Capsule/
+    Capsule Paper: https://arxiv.org/abs/1710.09829
+    """
+
+    def __init__(self,
+                 num_capsule,
+                 dim_capsule,
+                 routings=3,
+                 share_weights=True,
+                 activation='squash',
+                 **kwargs):
+        super(Capsule, self).__init__(**kwargs)
+        self.num_capsule = num_capsule
+        self.dim_capsule = dim_capsule
+        self.routings = routings
+        self.share_weights = share_weights
+        if activation == 'squash':
+            self.activation = squash
+        else:
+            self.activation = activations.get(activation)
+
+    def build(self, input_shape):
+        input_dim_capsule = input_shape[-1]
+        if self.share_weights:
+            self.kernel = self.add_weight(
+                name='capsule_kernel',
+                shape=(1, input_dim_capsule,
+                       self.num_capsule * self.dim_capsule),
+                initializer='glorot_uniform',
+                trainable=True)
+        else:
+            input_num_capsule = input_shape[-2]
+            self.kernel = self.add_weight(
+                name='capsule_kernel',
+                shape=(input_num_capsule, input_dim_capsule,
+                       self.num_capsule * self.dim_capsule),
+                initializer='glorot_uniform',
+                trainable=True)
+
+    def call(self, inputs):
+        """Following the routing algorithm from Hinton's paper,
+        but replace b = b + <u,v> with b = <u,v>.
+
+        This change can improve the feature representation of Capsule.
+
+        However, you can replace
+            b = K.batch_dot(outputs, hat_inputs, [2, 3])
+        with
+            b += K.batch_dot(outputs, hat_inputs, [2, 3])
+        to realize a standard routing.
+        """
+
+        if self.share_weights:
+            hat_inputs = K.conv1d(inputs, self.kernel)
+        else:
+            hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])
+
+        batch_size = K.shape(inputs)[0]
+        input_num_capsule = K.shape(inputs)[1]
+        hat_inputs = K.reshape(hat_inputs,
+                               (batch_size, input_num_capsule,
+                                self.num_capsule, self.dim_capsule))
+        hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))
+
+        b = K.zeros_like(hat_inputs[:, :, :, 0])
+        for i in range(self.routings):
+            c = softmax(b, 1)
+            o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
+            if K.backend() == 'theano':
+                o = K.sum(o, axis=1)
+            if i < self.routings - 1:
+                b = K.batch_dot(o, hat_inputs, [2, 3])
+                if K.backend() == 'theano':
+                    o = K.sum(o, axis=1)
+
+        return o
+
+    def compute_output_shape(self, input_shape):
+        return (None, self.num_capsule, self.dim_capsule)
+
+
+batch_size = 128
+num_classes = 10
+epochs = 100
+(x_train, y_train), (x_test, y_test) = cifar10.load_data()
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+y_train = utils.to_categorical(y_train, num_classes)
+y_test = utils.to_categorical(y_test, num_classes)
+
+# A common Conv2D model
+input_image = Input(shape=(None, None, 3))
+x = Conv2D(64, (3, 3), activation='relu')(input_image)
+x = Conv2D(64, (3, 3), activation='relu')(x)
+x = AveragePooling2D((2, 2))(x)
+x = Conv2D(128, (3, 3), activation='relu')(x)
+x = Conv2D(128, (3, 3), activation='relu')(x)
+
+
+"""Now we reshape it to (batch_size, input_num_capsule, input_dim_capsule),
+then connect a Capsule layer.
+
+The output of the final model is the length of each of the 10 capsules, each of dim=16.
+
+The length of a capsule vector represents the class probability,
+so the problem becomes a set of 10 binary classification problems.
+"""
+
+x = Reshape((-1, 128))(x)
+capsule = Capsule(10, 16, 3, True)(x)
+output = Lambda(lambda z: K.sqrt(K.sum(K.square(z), 2)))(capsule)
+model = Model(inputs=input_image, outputs=output)
+
+# we use a margin loss
+model.compile(loss=margin_loss, optimizer='adam', metrics=['accuracy'])
+model.summary()
+
+# we can compare the performance with or without data augmentation
+data_augmentation = True
+
+if not data_augmentation:
+    print('Not using data augmentation.')
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs,
+        validation_data=(x_test, y_test),
+        shuffle=True)
+else:
+    print('Using real-time data augmentation.')
+    # This will do preprocessing and real-time data augmentation:
+    datagen = ImageDataGenerator(
+        featurewise_center=False,  # set input mean to 0 over the dataset
+        samplewise_center=False,  # set each sample mean to 0
+        featurewise_std_normalization=False,  # divide inputs by dataset std
+        samplewise_std_normalization=False,  # divide each input by its std
+        zca_whitening=False,  # apply ZCA whitening
+        rotation_range=0,  # randomly rotate images in 0 to 180 degrees
+        width_shift_range=0.1,  # randomly shift images horizontally
+        height_shift_range=0.1,  # randomly shift images vertically
+        horizontal_flip=True,  # randomly flip images
+        vertical_flip=False)  # randomly flip images
+
+    # Compute quantities required for feature-wise normalization
+    # (std, mean, and principal components if ZCA whitening is applied).
+    datagen.fit(x_train)
+
+    # Fit the model on the batches generated by datagen.flow().
+ model.fit_generator( + datagen.flow(x_train, y_train, batch_size=batch_size), + epochs=epochs, + validation_data=(x_test, y_test), + workers=4) diff --git a/examples/cifar10_cnn_tfaugment2d.py b/examples/cifar10_cnn_tfaugment2d.py index c1e65348aa3..167e974c7c4 100644 --- a/examples/cifar10_cnn_tfaugment2d.py +++ b/examples/cifar10_cnn_tfaugment2d.py @@ -57,9 +57,9 @@ def augment_2d(inputs, rotation=0, horizontal_flip=False, vertical_flip=False): rotation: A float, the degree range for rotation (0 <= rotation < 180), e.g. 3 for random image rotation between (-3.0, 3.0). horizontal_flip: A boolean, whether to allow random horizontal flip, - e.g. true for 50% possiblity to flip image horizontally. + e.g. true for 50% possibility to flip image horizontally. vertical_flip: A boolean, whether to allow random vertical flip, - e.g. true for 50% possiblity to flip image vertically. + e.g. true for 50% possibility to flip image vertically. # Returns input data after augmentation, whose shape is the same as its original. diff --git a/examples/lstm_seq2seq.py b/examples/lstm_seq2seq.py index cd9b9b7f328..a53a3dcde66 100644 --- a/examples/lstm_seq2seq.py +++ b/examples/lstm_seq2seq.py @@ -66,7 +66,8 @@ target_texts = [] input_characters = set() target_characters = set() -lines = open(data_path, 'r', encoding='utf-8').read().split('\n') +with open(data_path, 'r', encoding='utf-8') as f: + lines = f.read().split('\n') for line in lines[: min(num_samples, len(lines) - 1)]: input_text, target_text = line.split('\t') # We use "tab" as the "start sequence" character diff --git a/examples/lstm_seq2seq_restore.py b/examples/lstm_seq2seq_restore.py index 5f1d2ebcd8b..5890c18bfde 100644 --- a/examples/lstm_seq2seq_restore.py +++ b/examples/lstm_seq2seq_restore.py @@ -29,7 +29,8 @@ target_texts = [] input_characters = set() target_characters = set() -lines = open(data_path, 'r', encoding='utf-8').read().split('\n') +with open(data_path, 'r', encoding='utf-8') as f: + lines = f.read().split('\n') for line in lines[: min(num_samples, len(lines) - 1)]: input_text, target_text = line.split('\t') # We use "tab" as the "start sequence" character diff --git a/examples/lstm_text_generation.py b/examples/lstm_text_generation.py index 1968535b469..961850a8422 100644 --- a/examples/lstm_text_generation.py +++ b/examples/lstm_text_generation.py @@ -23,7 +23,8 @@ import io path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt') -text = io.open(path, encoding='utf-8').read().lower() +with io.open(path, encoding='utf-8') as f: + text = f.read().lower() print('corpus length:', len(text)) chars = sorted(list(set(text))) diff --git a/examples/mnist_acgan.py b/examples/mnist_acgan.py index ec81a62e290..93fd09597cb 100644 --- a/examples/mnist_acgan.py +++ b/examples/mnist_acgan.py @@ -341,5 +341,5 @@ def build_discriminator(): Image.fromarray(img).save( 'plot_epoch_{0:03d}_generated.png'.format(epoch)) - pickle.dump({'train': train_history, 'test': test_history}, - open('acgan-history.pkl', 'wb')) + with open('acgan-history.pkl', 'wb') as f: + pickle.dump({'train': train_history, 'test': test_history}, f) diff --git a/examples/mnist_net2net.py b/examples/mnist_net2net.py index c8c05bff393..77a16b6a0a8 100644 --- a/examples/mnist_net2net.py +++ b/examples/mnist_net2net.py @@ -208,7 +208,7 @@ def deeper2net_conv2d(teacher_w): kh, kw, num_channel, filters = teacher_w.shape student_w = np.zeros_like(teacher_w) for i in range(filters): - student_w[(kh - 1) / 2, (kw - 1) / 2, i, i] = 1. 
+ student_w[(kh - 1) // 2, (kw - 1) // 2, i, i] = 1. student_b = np.zeros(filters) return student_w, student_b diff --git a/examples/mnist_sklearn_wrapper.py b/examples/mnist_sklearn_wrapper.py index 567f53a6da9..51001065e6e 100644 --- a/examples/mnist_sklearn_wrapper.py +++ b/examples/mnist_sklearn_wrapper.py @@ -65,7 +65,7 @@ def make_model(dense_layer_sizes, filters, kernel_size, pool_size): model.add(Flatten()) for layer_size in dense_layer_sizes: model.add(Dense(layer_size)) - model.add(Activation('relu')) + model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(num_classes)) model.add(Activation('softmax')) diff --git a/examples/mnist_tfrecord.py b/examples/mnist_tfrecord.py index c3fdc591544..a8cc93ea194 100644 --- a/examples/mnist_tfrecord.py +++ b/examples/mnist_tfrecord.py @@ -33,11 +33,11 @@ tensors, save the model weights, and then evaluate the model using the numpy based Keras API. -Gets to ~99.1% validation accuracy after 5 epochs +Gets to ~99.1% test accuracy after 5 epochs (high variance from run to run: 98.9-99.3). ''' import numpy as np - +import os import tensorflow as tf import keras from keras import backend as K @@ -118,7 +118,6 @@ def cnn_layers(x_train_input): batch_size = 100 batch_shape = (batch_size, 28, 28, 1) -steps_per_epoch = 600 epochs = 5 num_classes = 10 @@ -141,7 +140,10 @@ def cnn_layers(x_train_input): # output will have shape `[batch_size, x, y, z]`. enqueue_many = True -data = mnist.load_mnist() +cache_dir = os.path.expanduser( + os.path.join('~', '.keras', 'datasets', 'MNIST-data')) +data = mnist.read_data_sets(cache_dir, validation_size=0) + x_train_batch, y_train_batch = tf.train.shuffle_batch( tensors=[data.train.images, data.train.labels.astype(np.int32)], batch_size=batch_size, @@ -205,7 +207,7 @@ def cnn_layers(x_train_input): threads = tf.train.start_queue_runners(sess, coord) train_model.fit(epochs=epochs, - steps_per_epoch=steps_per_epoch, + steps_per_epoch=int(np.ceil(data.train.num_examples / float(batch_size))), callbacks=[EvaluateInputTensor(test_model, steps=100)]) # Save the model weights. @@ -217,8 +219,8 @@ def cnn_layers(x_train_input): K.clear_session() # Second Session to test loading trained model without tensors -x_test = np.reshape(data.validation.images, (data.validation.images.shape[0], 28, 28, 1)) -y_test = data.validation.labels +x_test = np.reshape(data.test.images, (data.test.images.shape[0], 28, 28, 1)) +y_test = data.test.labels x_test_inp = layers.Input(shape=(x_test.shape[1:])) test_out = cnn_layers(x_test_inp) test_model = keras.models.Model(inputs=x_test_inp, outputs=test_out) diff --git a/examples/pretrained_word_embeddings.py b/examples/pretrained_word_embeddings.py index d5f931ebee0..5e066902f5b 100644 --- a/examples/pretrained_word_embeddings.py +++ b/examples/pretrained_word_embeddings.py @@ -38,13 +38,12 @@ print('Indexing word vectors.') embeddings_index = {} -f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) -for line in f: - values = line.split() - word = values[0] - coefs = np.asarray(values[1:], dtype='float32') - embeddings_index[word] = coefs -f.close() +with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f: + for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = coefs print('Found %s word vectors.' 
% len(embeddings_index)) @@ -62,16 +61,13 @@ for fname in sorted(os.listdir(path)): if fname.isdigit(): fpath = os.path.join(path, fname) - if sys.version_info < (3,): - f = open(fpath) - else: - f = open(fpath, encoding='latin-1') - t = f.read() - i = t.find('\n\n') # skip header - if 0 < i: - t = t[i:] - texts.append(t) - f.close() + args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} + with open(fpath, **args) as f: + t = f.read() + i = t.find('\n\n') # skip header + if 0 < i: + t = t[i:] + texts.append(t) labels.append(label_id) print('Found %s texts.' % len(texts)) diff --git a/examples/variational_autoencoder.py b/examples/variational_autoencoder.py index fd6ff447216..fe1d48aea6c 100644 --- a/examples/variational_autoencoder.py +++ b/examples/variational_autoencoder.py @@ -5,11 +5,13 @@ - Auto-Encoding Variational Bayes https://arxiv.org/abs/1312.6114 ''' +from __future__ import print_function + import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm -from keras.layers import Input, Dense, Lambda, Layer +from keras.layers import Input, Dense, Lambda from keras.models import Model from keras import backend as K from keras import metrics @@ -44,29 +46,17 @@ def sampling(args): h_decoded = decoder_h(z) x_decoded_mean = decoder_mean(h_decoded) +# instantiate VAE model +vae = Model(x, x_decoded_mean) + +# Compute VAE loss +xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean) +kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) +vae_loss = K.mean(xent_loss + kl_loss) -# Custom loss layer -class CustomVariationalLayer(Layer): - def __init__(self, **kwargs): - self.is_placeholder = True - super(CustomVariationalLayer, self).__init__(**kwargs) - - def vae_loss(self, x, x_decoded_mean): - xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean) - kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) - return K.mean(xent_loss + kl_loss) - - def call(self, inputs): - x = inputs[0] - x_decoded_mean = inputs[1] - loss = self.vae_loss(x, x_decoded_mean) - self.add_loss(loss, inputs=inputs) - # We won't actually use the output. 
- return x - -y = CustomVariationalLayer()([x, x_decoded_mean]) -vae = Model(x, y) -vae.compile(optimizer='rmsprop', loss=None) +vae.add_loss(vae_loss) +vae.compile(optimizer='rmsprop') +vae.summary() # train the VAE on MNIST digits diff --git a/examples/variational_autoencoder_deconv.py b/examples/variational_autoencoder_deconv.py index 4d6dcb079f2..a32d63d40ed 100644 --- a/examples/variational_autoencoder_deconv.py +++ b/examples/variational_autoencoder_deconv.py @@ -6,11 +6,13 @@ - Auto-Encoding Variational Bayes https://arxiv.org/abs/1312.6114 ''' +from __future__ import print_function + import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm -from keras.layers import Input, Dense, Lambda, Flatten, Reshape, Layer +from keras.layers import Input, Dense, Lambda, Flatten, Reshape from keras.layers import Conv2D, Conv2DTranspose from keras.models import Model from keras import backend as K @@ -109,32 +111,18 @@ def sampling(args): x_decoded_relu = decoder_deconv_3_upsamp(deconv_2_decoded) x_decoded_mean_squash = decoder_mean_squash(x_decoded_relu) +# instantiate VAE model +vae = Model(x, x_decoded_mean_squash) -# Custom loss layer -class CustomVariationalLayer(Layer): - def __init__(self, **kwargs): - self.is_placeholder = True - super(CustomVariationalLayer, self).__init__(**kwargs) - - def vae_loss(self, x, x_decoded_mean_squash): - x = K.flatten(x) - x_decoded_mean_squash = K.flatten(x_decoded_mean_squash) - xent_loss = img_rows * img_cols * metrics.binary_crossentropy(x, x_decoded_mean_squash) - kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) - return K.mean(xent_loss + kl_loss) - - def call(self, inputs): - x = inputs[0] - x_decoded_mean_squash = inputs[1] - loss = self.vae_loss(x, x_decoded_mean_squash) - self.add_loss(loss, inputs=inputs) - # We don't use this output. 
- return x - +# Compute VAE loss +xent_loss = img_rows * img_cols * metrics.binary_crossentropy( + K.flatten(x), + K.flatten(x_decoded_mean_squash)) +kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) +vae_loss = K.mean(xent_loss + kl_loss) +vae.add_loss(vae_loss) -y = CustomVariationalLayer()([x, x_decoded_mean_squash]) -vae = Model(x, y) -vae.compile(optimizer='rmsprop', loss=None) +vae.compile(optimizer='rmsprop') vae.summary() # train the VAE on MNIST digits diff --git a/keras/__init__.py b/keras/__init__.py index e5546040d7b..97865ffe4ae 100644 --- a/keras/__init__.py +++ b/keras/__init__.py @@ -23,4 +23,4 @@ from .models import Model from .models import Sequential -__version__ = '2.1.3' +__version__ = '2.1.4' diff --git a/keras/applications/imagenet_utils.py b/keras/applications/imagenet_utils.py index ec937c9b54f..ef44690b59a 100644 --- a/keras/applications/imagenet_utils.py +++ b/keras/applications/imagenet_utils.py @@ -205,7 +205,8 @@ def decode_predictions(preds, top=5): CLASS_INDEX_PATH, cache_subdir='models', file_hash='c2c37ea517e94d9795004a39431a14cb') - CLASS_INDEX = json.load(open(fpath)) + with open(fpath) as f: + CLASS_INDEX = json.load(f) results = [] for pred in preds: top_indices = pred.argsort()[-top:][::-1] diff --git a/keras/backend/__init__.py b/keras/backend/__init__.py index 2e7208cf5af..6f0b870cde0 100644 --- a/keras/backend/__init__.py +++ b/keras/backend/__init__.py @@ -24,7 +24,8 @@ _config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json')) if os.path.exists(_config_path): try: - _config = json.load(open(_config_path)) + with open(_config_path) as f: + _config = json.load(f) except ValueError: _config = {} _floatx = _config.get('floatx', floatx()) diff --git a/keras/backend/cntk_backend.py b/keras/backend/cntk_backend.py index 84ae05343a6..9228d717bc9 100644 --- a/keras/backend/cntk_backend.py +++ b/keras/backend/cntk_backend.py @@ -447,7 +447,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): 'Please provide fixed dimension ' 'instead of `None`.') # how to apply mean and stddev - return random_normal_variable(shape=shape, mean=mean, scale=1.0) + return random_normal_variable(shape=shape, mean=mean, scale=1.0, seed=seed) def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @@ -565,7 +565,7 @@ def transpose(x): def gather(reference, indices): # There is a bug in cntk gather op which may cause crash. # We have made a fix but not catched in CNTK 2.1 release. - # Will udpate with gather op in next release + # Will update with gather op in next release if _get_cntk_version() >= 2.2: return C.ops.gather(reference, indices) else: @@ -2039,8 +2039,8 @@ def batch_get_value(xs): def set_value(x, value): if (isinstance(x, C.variables.Parameter) or isinstance(x, C.variables.Constant)): - if isinstance(value, float): - value = np.full(x.shape, value) + if isinstance(value, (float, int)): + value = np.full(x.shape, value, dtype=floatx()) x.value = value else: raise NotImplementedError @@ -2079,8 +2079,8 @@ def switch(condition, then_expression, else_expression): raise ValueError('Rank of condition should be less' ' than or equal to rank of then and' ' else expressions. 
ndim(condition)=' + - str(cond_ndim) + ', ndim(then_expression)' - '=' + str(expr_ndim)) + str(ndim_cond) + ', ndim(then_expression)' + '=' + str(ndim_expr)) elif ndim_cond < ndim_expr: shape_expr = int_shape(then_expression) ndim_diff = ndim_expr - ndim_cond diff --git a/keras/backend/tensorflow_backend.py b/keras/backend/tensorflow_backend.py index ef6e6c35395..27f8afed8c2 100644 --- a/keras/backend/tensorflow_backend.py +++ b/keras/backend/tensorflow_backend.py @@ -1939,7 +1939,7 @@ def resize_images(x, height_factor, width_factor, data_format): original_shape[2] * width_factor if original_shape[2] is not None else None, None)) return x else: - raise ValueError('Invalid data_format:', data_format) + raise ValueError('Unknown data_format: ' + str(data_format)) def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): @@ -1969,7 +1969,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): output = repeat_elements(output, width_factor, axis=3) return output else: - raise ValueError('Invalid data_format:', data_format) + raise ValueError('Unknown data_format: ' + str(data_format)) def repeat_elements(x, rep, axis): @@ -2046,7 +2046,7 @@ def arange(start, stop=None, step=1, dtype='int32'): The function arguments use the same convention as Theano's arange: if only one argument is provided, - it is in fact the "stop" argument. + it is in fact the "stop" argument and "start" is 0. The default type of the returned tensor is `'int32'` to match TensorFlow's default. @@ -2061,7 +2061,7 @@ def arange(start, stop=None, step=1, dtype='int32'): An integer tensor. """ - # Match the behavior of numpy and Theano by returning an empty seqence. + # Match the behavior of numpy and Theano by returning an empty sequence. 
+    # Match the behavior of numpy and Theano by returning an empty sequence.
if stop is None: try: if start < 0: @@ -2183,7 +2183,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) if data_format == 'channels_first': pattern = [[0, 0], @@ -2227,7 +2227,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) if data_format == 'channels_first': pattern = [ @@ -3247,7 +3247,7 @@ def _preprocess_padding(padding): elif padding == 'valid': padding = 'VALID' else: - raise ValueError('Invalid padding:', padding) + raise ValueError('Invalid padding: ' + str(padding)) return padding @@ -3272,7 +3272,7 @@ def conv1d(x, kernel, strides=1, padding='valid', if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) kernel_shape = kernel.get_shape().as_list() if padding == 'causal': @@ -3318,7 +3318,7 @@ def conv2d(x, kernel, strides=(1, 1), padding='valid', if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv2d_input(x, data_format) @@ -3359,7 +3359,7 @@ def conv2d_transpose(x, kernel, output_shape, strides=(1, 1), if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) if isinstance(output_shape, (tuple, list)): output_shape = tf.stack(output_shape) @@ -3410,16 +3410,16 @@ def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1, if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv1d_input(x, data_format) padding = _preprocess_padding(padding) if tf_data_format == 'NHWC': spatial_start_dim = 1 - strides = (1, 1) + strides + (1,) + strides = (1,) + strides * 2 + (1,) else: spatial_start_dim = 2 - strides = (1, 1, 1) + strides + strides = (1, 1) + strides * 2 x = tf.expand_dims(x, spatial_start_dim) depthwise_kernel = tf.expand_dims(depthwise_kernel, 0) pointwise_kernel = tf.expand_dims(pointwise_kernel, 0) @@ -3462,7 +3462,7 @@ def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1), if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3503,7 +3503,7 @@ def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid', 
if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3545,7 +3545,7 @@ def conv3d(x, kernel, strides=(1, 1, 1), padding='valid', if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv3d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3584,7 +3584,7 @@ def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1), if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) if isinstance(output_shape, (tuple, list)): output_shape = tf.stack(output_shape) @@ -3637,7 +3637,7 @@ def pool2d(x, pool_size, strides=(1, 1), if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv2d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3657,7 +3657,7 @@ def pool2d(x, pool_size, strides=(1, 1), padding=padding, data_format=tf_data_format) else: - raise ValueError('Invalid pooling mode:', pool_mode) + raise ValueError('Invalid pool_mode: ' + str(pool_mode)) if data_format == 'channels_first' and tf_data_format == 'NHWC': x = tf.transpose(x, (0, 3, 1, 2)) # NHWC -> NCHW @@ -3686,7 +3686,7 @@ def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) x, tf_data_format = _preprocess_conv3d_input(x, data_format) padding = _preprocess_padding(padding) @@ -3706,7 +3706,7 @@ def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid', padding=padding, data_format=tf_data_format) else: - raise ValueError('Invalid pooling mode:', pool_mode) + raise ValueError('Invalid pool_mode: ' + str(pool_mode)) if data_format == 'channels_first' and tf_data_format == 'NDHWC': x = tf.transpose(x, (0, 4, 1, 2, 3)) @@ -3734,7 +3734,7 @@ def bias_add(x, bias, data_format=None): if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) bias_shape = int_shape(bias) if len(bias_shape) != 1 and len(bias_shape) != ndim(x) - 1: raise ValueError('Unexpected bias dimensions %d, expect to be 1 or %d dimensions' @@ -4065,7 +4065,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) stride = strides[0] kernel_shape = 
int_shape(kernel) @@ -4117,7 +4117,7 @@ def local_conv2d(inputs, kernel, kernel_size, strides, output_shape, data_format if data_format is None: data_format = image_data_format() if data_format not in {'channels_first', 'channels_last'}: - raise ValueError('Unknown data_format ' + str(data_format)) + raise ValueError('Unknown data_format: ' + str(data_format)) stride_row, stride_col = strides output_row, output_col = output_shape diff --git a/keras/callbacks.py b/keras/callbacks.py index 7abbb82dfde..8b5ec5f341b 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -18,6 +18,7 @@ from collections import Iterable from .utils.generic_utils import Progbar from . import backend as K +from .engine.topology import Layer try: import requests @@ -202,8 +203,20 @@ class BaseLogger(Callback): """Callback that accumulates epoch averages of metrics. This callback is automatically applied to every Keras model. + + # Arguments + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is in `on_epoch_end`. + All others will be averaged in `on_epoch_end`. """ + def __init__(self, stateful_metrics=None): + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + def on_epoch_begin(self, epoch, logs=None): self.seen = 0 self.totals = {} @@ -214,17 +227,23 @@ def on_batch_end(self, batch, logs=None): self.seen += batch_size for k, v in logs.items(): - if k in self.totals: - self.totals[k] += v * batch_size + if k in self.stateful_metrics: + self.totals[k] = v else: - self.totals[k] = v * batch_size + if k in self.totals: + self.totals[k] += v * batch_size + else: + self.totals[k] = v * batch_size def on_epoch_end(self, epoch, logs=None): if logs is not None: for k in self.params['metrics']: if k in self.totals: # Make value available to next callbacks. - logs[k] = self.totals[k] / self.seen + if k in self.stateful_metrics: + logs[k] = self.totals[k] + else: + logs[k] = self.totals[k] / self.seen class TerminateOnNaN(Callback): @@ -250,12 +269,17 @@ class ProgbarLogger(Callback): count_mode: One of "steps" or "samples". Whether the progress bar should count samples seen or steps (batches) seen. + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is. + All others will be averaged over time (e.g. loss, etc). # Raises ValueError: In case of invalid `count_mode`. 
""" - def __init__(self, count_mode='samples'): + def __init__(self, count_mode='samples', + stateful_metrics=None): super(ProgbarLogger, self).__init__() if count_mode == 'samples': self.use_steps = False @@ -263,6 +287,10 @@ def __init__(self, count_mode='samples'): self.use_steps = True else: raise ValueError('Unknown `count_mode`: ' + str(count_mode)) + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() def on_train_begin(self, logs=None): self.verbose = self.params['verbose'] @@ -277,7 +305,8 @@ def on_epoch_begin(self, epoch, logs=None): target = self.params['samples'] self.target = target self.progbar = Progbar(target=self.target, - verbose=self.verbose) + verbose=self.verbose, + stateful_metrics=self.stateful_metrics) self.seen = 0 def on_batch_begin(self, batch, logs=None): @@ -307,7 +336,7 @@ def on_epoch_end(self, epoch, logs=None): if k in logs: self.log_values.append((k, logs[k])) if self.verbose: - self.progbar.update(self.seen, self.log_values, force=True) + self.progbar.update(self.seen, self.log_values) class History(Callback): @@ -546,7 +575,10 @@ def on_epoch_end(self, epoch, logs=None): send = {} send['epoch'] = epoch for k, v in logs.items(): - send[k] = v + if isinstance(v, (np.ndarray, np.generic)): + send[k] = v.item() + else: + send[k] = v try: requests.post(self.root + self.path, {self.field: json.dumps(send)}, @@ -561,8 +593,8 @@ class LearningRateScheduler(Callback): # Arguments schedule: a function that takes an epoch index as input - (integer, indexed from 0) and returns a new - learning rate as output (float). + (integer, indexed from 0) and current learning rate + and returns a new learning rate as output (float). verbose: int. 0: quiet, 1: update messages. """ @@ -574,7 +606,11 @@ def __init__(self, schedule, verbose=0): def on_epoch_begin(self, epoch, logs=None): if not hasattr(self.model.optimizer, 'lr'): raise ValueError('Optimizer must have a "lr" attribute.') - lr = self.schedule(epoch) + lr = float(K.get_value(self.model.optimizer.lr)) + try: # new API + lr = self.schedule(epoch, lr=lr) + except TypeError: # old API for backward compatibility + lr = self.schedule(epoch) if not isinstance(lr, (float, np.float32, np.float64)): raise ValueError('The output of the "schedule" function ' 'should be float.') @@ -585,7 +621,7 @@ def on_epoch_begin(self, epoch, logs=None): class TensorBoard(Callback): - """Tensorboard basic visualizations. + """TensorBoard basic visualizations. [TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard) is a visualization tool provided with TensorFlow. @@ -601,6 +637,10 @@ class TensorBoard(Callback): tensorboard --logdir=/full_path_to_your_logs ``` + When using a backend other than TensorFlow, TensorBoard will still work + (if you have TensorFlow installed), but the only feature available will + be the display of the losses and metrics plots. + # Arguments log_dir: the path of the directory where to save the log files to be parsed by TensorBoard. 
@@ -638,12 +678,31 @@ def __init__(self, log_dir='./logs', embeddings_layer_names=None, embeddings_metadata=None): super(TensorBoard, self).__init__() - if K.backend() != 'tensorflow': - raise RuntimeError('TensorBoard callback only works ' - 'with the TensorFlow backend.') global tf, projector - import tensorflow as tf - from tensorflow.contrib.tensorboard.plugins import projector + try: + import tensorflow as tf + from tensorflow.contrib.tensorboard.plugins import projector + except ImportError: + raise ImportError('You need the TensorFlow module installed to use TensorBoard.') + + if K.backend() != 'tensorflow': + if histogram_freq != 0: + warnings.warn('You are not using the TensorFlow backend. ' + 'histogram_freq was set to 0') + histogram_freq = 0 + if write_graph: + warnings.warn('You are not using the TensorFlow backend. ' + 'write_graph was set to False') + write_graph = False + if write_images: + warnings.warn('You are not using the TensorFlow backend. ' + 'write_images was set to False') + write_images = False + if embeddings_freq != 0: + warnings.warn('You are not using the TensorFlow backend. ' + 'embeddings_freq was set to 0') + embeddings_freq = 0 + self.log_dir = log_dir self.histogram_freq = histogram_freq self.merged = None @@ -657,7 +716,8 @@ def __init__(self, log_dir='./logs', def set_model(self, model): self.model = model - self.sess = K.get_session() + if K.backend() == 'tensorflow': + self.sess = K.get_session() if self.histogram_freq and self.merged is None: for layer in self.model.layers: diff --git a/keras/constraints.py b/keras/constraints.py index 840b7eb03e0..b3a97edb533 100644 --- a/keras/constraints.py +++ b/keras/constraints.py @@ -181,5 +181,5 @@ def get(identifier): elif callable(identifier): return identifier else: - raise ValueError('Could not interpret constraint identifier:', - identifier) + raise ValueError('Could not interpret constraint identifier: ' + + str(identifier)) diff --git a/keras/datasets/cifar.py b/keras/datasets/cifar.py index 196691bd662..62b40fd6119 100644 --- a/keras/datasets/cifar.py +++ b/keras/datasets/cifar.py @@ -20,17 +20,16 @@ def load_batch(fpath, label_key='labels'): # Returns A tuple `(data, labels)`. """ - f = open(fpath, 'rb') - if sys.version_info < (3,): - d = cPickle.load(f) - else: - d = cPickle.load(f, encoding='bytes') - # decode utf8 - d_decoded = {} - for k, v in d.items(): - d_decoded[k.decode('utf8')] = v - d = d_decoded - f.close() + with open(fpath, 'rb') as f: + if sys.version_info < (3,): + d = cPickle.load(f) + else: + d = cPickle.load(f, encoding='bytes') + # decode utf8 + d_decoded = {} + for k, v in d.items(): + d_decoded[k.decode('utf8')] = v + d = d_decoded data = d['data'] labels = d[label_key] diff --git a/keras/datasets/imdb.py b/keras/datasets/imdb.py index 86f76a48f1d..4dc032eb78f 100644 --- a/keras/datasets/imdb.py +++ b/keras/datasets/imdb.py @@ -114,7 +114,5 @@ def get_word_index(path='imdb_word_index.json'): path = get_file(path, origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json', file_hash='bfafd718b763782e994055a2d397834f') - f = open(path) - data = json.load(f) - f.close() - return data + with open(path) as f: + return json.load(f) diff --git a/keras/engine/topology.py b/keras/engine/topology.py index 301f7e07fc6..21a71670ef1 100644 --- a/keras/engine/topology.py +++ b/keras/engine/topology.py @@ -1944,7 +1944,10 @@ def losses(self): losses += layer.get_losses_for(None) # Add any potential unconditional model-level loss. 
losses += self.get_losses_for(None) - return losses + + unique_tensors = list(set(x for x in losses if not isinstance(x, (float, int)))) + non_tensors = [x for x in losses if isinstance(x, (float, int))] + return unique_tensors + non_tensors @property def uses_learning_phase(self): @@ -2244,7 +2247,7 @@ def run_internal_graph(self, inputs, masks=None): # Apply activity regularizer if any: if hasattr(layer, 'activity_regularizer') and layer.activity_regularizer is not None: - regularization_losses = [layer.activity_regularizer(x) for x in computed_tensors] + regularization_losses = [layer.activity_regularizer(x) for x in output_tensors] layer.add_loss(regularization_losses, computed_tensors) # Update model updates and losses: @@ -2601,12 +2604,12 @@ def save_weights(self, filepath, overwrite=True): proceed = ask_to_proceed_with_overwrite(filepath) if not proceed: return - f = h5py.File(filepath, 'w') - save_weights_to_hdf5_group(f, self.layers) - f.flush() - f.close() + with h5py.File(filepath, 'w') as f: + save_weights_to_hdf5_group(f, self.layers) + f.flush() - def load_weights(self, filepath, by_name=False, skip_mismatch=False): + def load_weights(self, filepath, by_name=False, + skip_mismatch=False, reshape=False): """Loads all layer weights from a HDF5 save file. If `by_name` is False (default) weights are loaded @@ -2629,6 +2632,8 @@ def load_weights(self, filepath, by_name=False, skip_mismatch=False): where there is a mismatch in the number of weights, or a mismatch in the shape of the weight (only valid when `by_name`=True). + reshape: Reshape weights to fit the layer when the correct number + of weight arrays is present but their shape does not match. # Raises @@ -2636,17 +2641,16 @@ def load_weights(self, filepath, by_name=False, skip_mismatch=False): """ if h5py is None: raise ImportError('`load_weights` requires h5py.') - f = h5py.File(filepath, mode='r') - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - if by_name: - load_weights_from_hdf5_group_by_name( - f, self.layers, skip_mismatch=skip_mismatch) - else: - load_weights_from_hdf5_group(f, self.layers) - - if hasattr(f, 'close'): - f.close() + with h5py.File(filepath, mode='r') as f: + if 'layer_names' not in f.attrs and 'model_weights' in f: + f = f['model_weights'] + if by_name: + load_weights_from_hdf5_group_by_name( + f, self.layers, skip_mismatch=skip_mismatch, + reshape=reshape) + else: + load_weights_from_hdf5_group( + f, self.layers, reshape=reshape) def _updated_config(self): """Util hared between different serialization methods. @@ -2957,7 +2961,8 @@ def save_weights_to_hdf5_group(f, layers): def preprocess_weights_for_loading(layer, weights, original_keras_version=None, - original_backend=None): + original_backend=None, + reshape=False): """Converts layers weights from Keras 1 format to Keras 2. # Arguments @@ -2966,6 +2971,8 @@ def preprocess_weights_for_loading(layer, weights, original_keras_version: Keras version for the weights, as a string. original_backend: Keras backend the weights were trained with, as a string. + reshape: Reshape weights to fit the layer when the correct number + of values are present but the shape does not match. # Returns A list of weights values (Numpy arrays). 
@@ -3105,11 +3112,24 @@ def preprocess_weights_for_loading(layer, weights, 'Conv2DTranspose', 'ConvLSTM2D'] if layer.__class__.__name__ in conv_layers: + layer_weights_shape = K.int_shape(layer.weights[0]) if _need_convert_kernel(original_backend): weights[0] = conv_utils.convert_kernel(weights[0]) if layer.__class__.__name__ == 'ConvLSTM2D': weights[1] = conv_utils.convert_kernel(weights[1]) - if K.int_shape(layer.weights[0]) != weights[0].shape: + if reshape and layer_weights_shape != weights[0].shape: + if weights[0].size != np.prod(layer_weights_shape): + raise ValueError('Weights must be of equal size to ' + + 'apply a reshape operation. ' + + 'Layer ' + layer.name + + '\'s weights have shape ' + + str(layer_weights_shape) + ' and size ' + + str(np.prod(layer_weights_shape)) + '. ' + + 'The weights for loading have shape ' + + str(weights[0].shape) + ' and size ' + + str(weights[0].size) + '. ') + weights[0] = np.reshape(weights[0], layer_weights_shape) + elif layer_weights_shape != weights[0].shape: weights[0] = np.transpose(weights[0], (3, 2, 0, 1)) if layer.__class__.__name__ == 'ConvLSTM2D': weights[1] = np.transpose(weights[1], (3, 2, 0, 1)) @@ -3161,12 +3181,14 @@ def _need_convert_kernel(original_backend): return uses_correlation[original_backend] != uses_correlation[K.backend()] -def load_weights_from_hdf5_group(f, layers): +def load_weights_from_hdf5_group(f, layers, reshape=False): """Implements topological (order-based) weight loading. # Arguments f: A pointer to a HDF5 group. layers: a list of target layers. + reshape: Reshape weights to fit the layer when the correct number + of values are present but the shape does not match. # Raises ValueError: in case of mismatch between provided layers @@ -3213,7 +3235,8 @@ def load_weights_from_hdf5_group(f, layers): weight_values = preprocess_weights_for_loading(layer, weight_values, original_keras_version, - original_backend) + original_backend, + reshape=reshape) if len(weight_values) != len(symbolic_weights): raise ValueError('Layer #' + str(k) + ' (named "' + layer.name + @@ -3229,7 +3252,8 @@ def load_weights_from_hdf5_group(f, layers): K.batch_set_value(weight_value_tuples) -def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False): +def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False, + reshape=False): """Implements name-based weight loading. (instead of topological weight loading). @@ -3242,6 +3266,8 @@ def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False): skip_mismatch: Boolean, whether to skip loading of layers where there is a mismatch in the number of weights, or a mismatch in the shape of the weights. + reshape: Reshape weights to fit the layer when the correct number + of values are present but the shape does not match. # Raises ValueError: in case of mismatch between provided layers @@ -3279,7 +3305,8 @@ def load_weights_from_hdf5_group_by_name(f, layers, skip_mismatch=False): layer, weight_values, original_keras_version, - original_backend) + original_backend, + reshape=reshape) if len(weight_values) != len(symbolic_weights): if skip_mismatch: warnings.warn('Skipping loading of weights for layer {}'.format(layer.name) + diff --git a/keras/engine/training.py b/keras/engine/training.py index ba83775beeb..103b8e58593 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -11,6 +11,7 @@ from scipy.sparse import issparse from .topology import Container +from .topology import Layer from .. import backend as K from .. import optimizers from .. 
import losses @@ -60,17 +61,19 @@ def _standardize_input_data(data, names, shapes=None, if isinstance(data, dict): try: data = [data[x].values if data[x].__class__.__name__ == 'DataFrame' else data[x] for x in names] - data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data] except KeyError as e: raise ValueError( 'No data provided for "' + e.args[0] + '". Need data ' 'for each key in: ' + str(names)) elif isinstance(data, list): - data = [x.values if x.__class__.__name__ == 'DataFrame' else x for x in data] - data = [np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data] + if len(names) == 1 and data and isinstance(data[0], (float, int)): + data = [np.asarray(data)] + else: + data = [x.values if x.__class__.__name__ == 'DataFrame' else x for x in data] else: data = data.values if data.__class__.__name__ == 'DataFrame' else data - data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data] + data = [data] + data = [np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data] if len(data) != len(names): if data and hasattr(data[0], 'shape'): @@ -181,7 +184,7 @@ def _standardize_sample_weights(sample_weight, output_names): def _check_array_lengths(inputs, targets, weights=None): - """Does user input validation for numpy arrays. + """Checks if batch axes are the same for numpy arrays. # Arguments inputs: list of Numpy arrays of inputs. @@ -514,7 +517,7 @@ def _standardize_weights(y, sample_weight=None, class_weight=None, raise ValueError('`class_weight` not supported for ' '3+ dimensional targets.') if y.shape[1] > 1: - y_classes = y.argmax(axis=1) + y_classes = np.argmax(y, axis=1) elif y.shape[1] == 1: y_classes = np.reshape(y, y.shape[0]) else: @@ -543,7 +546,7 @@ class Model(Container): """The `Model` class adds training & evaluation routines to a `Container`. """ - def compile(self, optimizer, loss, metrics=None, loss_weights=None, + def compile(self, optimizer, loss=None, metrics=None, loss_weights=None, sample_weight_mode=None, weighted_metrics=None, target_tensors=None, **kwargs): """Configures the model for training. @@ -805,7 +808,7 @@ def compile(self, optimizer, loss, metrics=None, loss_weights=None, self._feed_sample_weight_modes.append(self.sample_weight_modes[i]) # Prepare metrics. - self.metrics = metrics + self.metrics = metrics or [] self.weighted_metrics = weighted_metrics self.metrics_names = ['loss'] self.metrics_tensors = [] @@ -848,14 +851,8 @@ def compile(self, optimizer, loss, metrics=None, loss_weights=None, # contains tuples (metrics for output, names of metrics). 
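# Usage sketch (illustrative only, not part of this diff): the relaxed
# `_standardize_input_data` handling above lets a plain Python list of scalars
# be passed as the target of a single-output model; it is converted to a numpy
# array internally (see the `test_with_list_as_targets` test added further down).
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(1, input_dim=3)])
model.compile('rmsprop', 'mse')
model.train_on_batch(np.random.random((2, 3)), [0, 1])  # list targets now accepted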
nested_metrics = _collect_metrics(metrics, self.output_names) nested_weighted_metrics = _collect_metrics(weighted_metrics, self.output_names) - - def append_metric(layer_index, metric_name, metric_tensor): - """Helper function used in loop below.""" - if len(self.output_names) > 1: - metric_name = self.output_names[layer_index] + '_' + metric_name - self.metrics_names.append(metric_name) - self.metrics_tensors.append(metric_tensor) - + self.metrics_updates = [] + self.stateful_metric_names = [] with K.name_scope('metrics'): for i in range(len(self.outputs)): if i in skip_target_indices: @@ -879,37 +876,60 @@ def handle_metrics(metrics, weights=None): self.loss_functions[i] == losses.binary_crossentropy): # case: binary accuracy/crossentropy if metric in ('accuracy', 'acc'): - acc_fn = metrics_module.binary_accuracy + metric_fn = metrics_module.binary_accuracy elif metric in ('crossentropy', 'ce'): - acc_fn = metrics_module.binary_crossentropy + metric_fn = metrics_module.binary_crossentropy elif self.loss_functions[i] == losses.sparse_categorical_crossentropy: # case: categorical accuracy/crossentropy with sparse targets if metric in ('accuracy', 'acc'): - acc_fn = metrics_module.sparse_categorical_accuracy + metric_fn = metrics_module.sparse_categorical_accuracy elif metric in ('crossentropy', 'ce'): - acc_fn = metrics_module.sparse_categorical_crossentropy + metric_fn = metrics_module.sparse_categorical_crossentropy else: # case: categorical accuracy/crossentropy if metric in ('accuracy', 'acc'): - acc_fn = metrics_module.categorical_accuracy + metric_fn = metrics_module.categorical_accuracy elif metric in ('crossentropy', 'ce'): - acc_fn = metrics_module.categorical_crossentropy + metric_fn = metrics_module.categorical_crossentropy if metric in ('accuracy', 'acc'): suffix = 'acc' elif metric in ('crossentropy', 'ce'): suffix = 'ce' - weighted_metric_fn = _weighted_masked_objective(acc_fn) + weighted_metric_fn = _weighted_masked_objective(metric_fn) metric_name = metric_name_prefix + suffix else: metric_fn = metrics_module.get(metric) weighted_metric_fn = _weighted_masked_objective(metric_fn) - metric_name = metric_name_prefix + metric_fn.__name__ + # Get metric name as string + if hasattr(metric_fn, 'name'): + metric_name = metric_fn.name + else: + metric_name = metric_fn.__name__ + metric_name = metric_name_prefix + metric_name with K.name_scope(metric_name): metric_result = weighted_metric_fn(y_true, y_pred, weights=weights, mask=masks[i]) - append_metric(i, metric_name, metric_result) + + # Append to self.metrics_names, self.metric_tensors, + # self.stateful_metric_names + if len(self.output_names) > 1: + metric_name = self.output_names[i] + '_' + metric_name + # Dedupe name + j = 1 + base_metric_name = metric_name + while metric_name in self.metrics_names: + metric_name = base_metric_name + '_' + str(j) + j += 1 + self.metrics_names.append(metric_name) + self.metrics_tensors.append(metric_result) + + # Keep track of state updates created by + # stateful metrics (i.e. metrics layers). 
+ if isinstance(metric_fn, Layer): + self.stateful_metric_names.append(metric_name) + self.metrics_updates += metric_fn.updates handle_metrics(output_metrics) handle_metrics(output_weighted_metrics, weights=weights) @@ -968,7 +988,7 @@ def _make_train_function(self): training_updates = self.optimizer.get_updates( params=self._collected_trainable_weights, loss=self.total_loss) - updates = self.updates + training_updates + updates = self.updates + training_updates + self.metrics_updates # Gets loss and metrics. Updates weights at each call. self.train_function = K.function(inputs, [self.total_loss] + self.metrics_tensors, @@ -987,7 +1007,7 @@ def _make_test_function(self): # Does update the network states. self.test_function = K.function(inputs, [self.total_loss] + self.metrics_tensors, - updates=self.state_updates, + updates=self.state_updates + self.metrics_updates, name='test_function', **self._function_kwargs) @@ -1108,14 +1128,19 @@ def _fit_loop(self, f, ins, out_labels=None, batch_size=None, index_array = np.arange(num_train_samples) self.history = cbks.History() - callbacks = [cbks.BaseLogger()] + (callbacks or []) + [self.history] + _callbacks = [cbks.BaseLogger( + stateful_metrics=self.stateful_metric_names)] if verbose: if steps_per_epoch is not None: count_mode = 'steps' else: count_mode = 'samples' - callbacks += [cbks.ProgbarLogger(count_mode)] - callbacks = cbks.CallbackList(callbacks) + _callbacks.append( + cbks.ProgbarLogger( + count_mode, + stateful_metrics=self.stateful_metric_names)) + _callbacks += (callbacks or []) + [self.history] + callbacks = cbks.CallbackList(_callbacks) out_labels = out_labels or [] # it's possible to callback a different model than self @@ -1148,6 +1173,10 @@ def _fit_loop(self, f, ins, out_labels=None, batch_size=None, indices_for_conversion_to_dense.append(i) for epoch in range(initial_epoch, epochs): + # Reset stateful metrics + for m in self.metrics: + if isinstance(m, Layer): + m.reset_states() callbacks.on_epoch_begin(epoch) epoch_logs = {} if steps_per_epoch is not None: @@ -1246,14 +1275,21 @@ def _predict_loop(self, f, ins, batch_size=32, verbose=0, steps=None): or list of arrays of predictions (if the model has multiple outputs). """ + + if hasattr(self, 'metrics'): + for m in self.metrics: + if isinstance(m, Layer): + m.reset_states() num_samples = self._check_num_samples(ins, batch_size, steps, 'steps') if verbose == 1: if steps is not None: - progbar = Progbar(target=steps) + progbar = Progbar(target=steps, + stateful_metrics=self.stateful_metric_names) else: - progbar = Progbar(target=num_samples) + progbar = Progbar(target=num_samples, + stateful_metrics=self.stateful_metric_names) indices_for_conversion_to_dense = [] for i in range(len(self._feed_inputs)): @@ -1332,6 +1368,17 @@ def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None): and/or metrics). The attribute `model.metrics_names` will give you the display labels for the scalar outputs. """ + + if hasattr(self, 'metrics'): + for m in self.metrics: + if isinstance(m, Layer): + m.reset_states() + stateful_metric_indices = [ + i for i, name in enumerate(self.metrics_names) + if str(name) in self.stateful_metric_names] + else: + stateful_metric_indices = [] + num_samples = self._check_num_samples(ins, batch_size, steps, 'steps') @@ -1357,7 +1404,10 @@ def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None): for _ in enumerate(batch_outs): outs.append(0.) 
for i, batch_out in enumerate(batch_outs): - outs[i] += batch_out + if i in stateful_metric_indices: + outs[i] = batch_out + else: + outs[i] += batch_out else: if step == 0: outs.append(0.) @@ -1365,7 +1415,8 @@ def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None): if verbose == 1: progbar.update(step + 1) for i in range(len(outs)): - outs[i] /= steps + if i not in stateful_metric_indices: + outs[i] /= steps else: batches = _make_batches(num_samples, batch_size) index_array = np.arange(num_samples) @@ -1385,7 +1436,10 @@ def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None): for batch_out in enumerate(batch_outs): outs.append(0.) for i, batch_out in enumerate(batch_outs): - outs[i] += batch_out * len(batch_ids) + if i in stateful_metric_indices: + outs[i] = batch_out + else: + outs[i] += batch_out * len(batch_ids) else: if batch_index == 0: outs.append(0.) @@ -1394,14 +1448,15 @@ def _test_loop(self, f, ins, batch_size=None, verbose=0, steps=None): if verbose == 1: progbar.update(batch_end) for i in range(len(outs)): - outs[i] /= num_samples + if i not in stateful_metric_indices: + outs[i] /= num_samples if len(outs) == 1: return outs[0] return outs def _standardize_user_data(self, x, y, sample_weight=None, class_weight=None, - check_batch_axis=True, batch_size=None): + check_array_lengths=True, batch_size=None): if not hasattr(self, 'optimizer'): raise RuntimeError('You must compile a model before ' 'training/testing. ' @@ -1420,6 +1475,8 @@ def _standardize_user_data(self, x, y, output_shapes.append(None) else: output_shapes.append(output_shape) + # `check_batch_axis` is set to False since `x` may contain multiple batches + # and in general `x[0].shape[0] != self._feed_input_shapes[0][0]` x = _standardize_input_data(x, self._feed_input_names, self._feed_input_shapes, check_batch_axis=False, @@ -1435,7 +1492,9 @@ def _standardize_user_data(self, x, y, sample_weights = [_standardize_weights(ref, sw, cw, mode) for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights, self._feed_sample_weight_modes)] - _check_array_lengths(x, y, sample_weights) + + if check_array_lengths: + _check_array_lengths(x, y, sample_weights) _check_loss_and_target_compatibility(y, self._feed_loss_fns, self._feed_output_shapes) @@ -1448,20 +1507,6 @@ def _standardize_user_data(self, x, y, str(x[0].shape[0]) + ' samples') return x, y, sample_weights - def _get_deduped_metrics_names(self): - out_labels = self.metrics_names - - # Rename duplicated metrics name - # (can happen with an output layer shared among multiple dataflows). - deduped_out_labels = [] - for i, label in enumerate(out_labels): - new_label = label - if out_labels.count(label) > 1: - dup_idx = out_labels[:i].count(label) - new_label += '_' + str(dup_idx + 1) - deduped_out_labels.append(new_label) - return deduped_out_labels - def fit(self, x=None, y=None, @@ -1589,7 +1634,6 @@ def fit(self, x, y, sample_weight=sample_weight, class_weight=class_weight, - check_batch_axis=False, batch_size=batch_size) # Prepare validation data. do_validation = False @@ -1610,7 +1654,6 @@ def fit(self, val_x, val_y, val_sample_weights = self._standardize_user_data( val_x, val_y, sample_weight=val_sample_weight, - check_batch_axis=False, batch_size=batch_size) if self.uses_learning_phase and not isinstance(K.learning_phase(), int): val_ins = val_x + val_y + val_sample_weights + [0.] @@ -1647,7 +1690,7 @@ def fit(self, f = self.train_function # Prepare display labels. 
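# Behaviour sketch (illustrative only, not part of this diff): with the
# stateful-metric handling above, `evaluate` no longer averages outputs whose
# index is in `stateful_metric_indices`; it reports the value from the final
# batch, which already reflects the state accumulated over the whole pass.
# `model`, `x`, `y` are placeholders for a model compiled with a stateful
# metric (a `Layer` subclass), e.g. the BinaryTruePositives metric in the
# tests further down.
outs = model.evaluate(x, y, batch_size=10)
# outs ~ [mean_loss, mean_acc, total_from_stateful_metric]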
- out_labels = self._get_deduped_metrics_names() + out_labels = self.metrics_names if do_validation: self._make_test_function() @@ -1729,7 +1772,6 @@ def evaluate(self, x=None, y=None, x, y, sample_weights = self._standardize_user_data( x, y, sample_weight=sample_weight, - check_batch_axis=False, batch_size=batch_size) # Prepare inputs, delegate logic to `_test_loop`. if self.uses_learning_phase and not isinstance(K.learning_phase(), int): @@ -1839,8 +1881,7 @@ class indices (integers) to x, y, sample_weights = self._standardize_user_data( x, y, sample_weight=sample_weight, - class_weight=class_weight, - check_batch_axis=True) + class_weight=class_weight) if self.uses_learning_phase and not isinstance(K.learning_phase(), int): ins = x + y + sample_weights + [1.] else: @@ -1881,8 +1922,7 @@ def test_on_batch(self, x, y, sample_weight=None): """ x, y, sample_weights = self._standardize_user_data( x, y, - sample_weight=sample_weight, - check_batch_axis=True) + sample_weight=sample_weight) if self.uses_learning_phase and not isinstance(K.learning_phase(), int): ins = x + y + sample_weights + [0.] else: @@ -1929,7 +1969,7 @@ def fit_generator(self, use_multiprocessing=False, shuffle=True, initial_epoch=0): - """Fits the model on data yielded batch-by-batch by a Python generator. + """Trains the model on data yielded batch-by-batch by a Python generator. The generator is run in parallel to the model, for efficiency. For instance, this allows you to do real-time data augmentation @@ -1940,41 +1980,60 @@ def fit_generator(self, using `use_multiprocessing=True`. # Arguments - generator: A generator or an instance of `Sequence` (`keras.utils.Sequence`) - object in order to avoid duplicate data - when using multiprocessing. + generator: A generator or an instance of `Sequence` + (`keras.utils.Sequence`) object in order to avoid + duplicate data when using multiprocessing. The output of the generator must be either - a tuple `(inputs, targets)` - a tuple `(inputs, targets, sample_weights)`. - This tuple (a single output of the generator) makes a single batch. - Therefore, all arrays in this tuple must have the same length (equal - to the size of this batch). Different batches may have different sizes. - For example, the last batch of the epoch is commonly smaller than the - others, if the size of the dataset is not divisible by the batch size. + This tuple (a single output of the generator) makes a single + batch. Therefore, all arrays in this tuple must have the same + length (equal to the size of this batch). Different batches + may have different sizes. For example, the last batch of the + epoch is commonly smaller than the others, if the size of the + dataset is not divisible by the batch size. The generator is expected to loop over its data indefinitely. An epoch finishes when `steps_per_epoch` batches have been seen by the model. - steps_per_epoch: Total number of steps (batches of samples) + steps_per_epoch: Integer. + Total number of steps (batches of samples) to yield from `generator` before declaring one epoch finished and starting the next epoch. It should typically be equal to the number of samples of your dataset divided by the batch size. Optional for `Sequence`: if unspecified, will use the `len(generator)` as a number of steps. - epochs: Integer, total number of iterations on the data. - verbose: Verbosity mode, 0, 1, or 2. - callbacks: List of callbacks to be called during training. + epochs: Integer. Number of epochs to train the model. 
+ An epoch is an iteration over the entire data provided, + as defined by `steps_per_epoch`. + Note that in conjunction with `initial_epoch`, + `epochs` is to be understood as "final epoch". + The model is not trained for a number of iterations + given by `epochs`, but merely until the epoch + of index `epochs` is reached. + verbose: Integer. 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during training. + See [callbacks](/callbacks). validation_data: This can be either - a generator for the validation data - - a tuple (inputs, targets) - - a tuple (inputs, targets, sample_weights). + - tuple `(x_val, y_val)` + - tuple `(x_val, y_val, val_sample_weights)` + on which to evaluate + the loss and any model metrics at the end of each epoch. + The model will not be trained on this data. validation_steps: Only relevant if `validation_data` is a generator. Total number of steps (batches of samples) - to yield from `generator` before stopping. + to yield from `validation_data` generator before stopping. Optional for `Sequence`: if unspecified, will use the `len(validation_data)` as a number of steps. - class_weight: Dictionary mapping class indices to a weight - for the class. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) value, used for weighting the loss function + (during training only). + This can be useful to tell the model to + "pay more attention" to samples from + an under-represented class. max_queue_size: Integer. Maximum size for the generator queue. If unspecified, `max_queue_size` will default to 10. workers: Integer. Maximum number of processes to spin up @@ -1989,27 +2048,30 @@ def fit_generator(self, non picklable arguments to the generator as they can't be passed easily to children processes. - shuffle: Whether to shuffle the order of the batches at - the beginning of each epoch. Only used with instances - of `Sequence` (keras.utils.Sequence). - initial_epoch: Epoch at which to start training - (useful for resuming a previous training run) + shuffle: Boolean. Whether to shuffle the training data + in batch-sized chunks before each epoch. + Only used with instances of `Sequence` (`keras.utils.Sequence`). + initial_epoch: Integer. + Epoch at which to start training + (useful for resuming a previous training run). # Returns - A `History` object. + A `History` object. Its `History.history` attribute is + a record of training loss values and metrics values + at successive epochs, as well as validation loss values + and validation metrics values (if applicable). # Example ```python def generate_arrays_from_file(path): while 1: - f = open(path) - for line in f: - # create numpy arrays of input data - # and labels, from each line in the file - x1, x2, y = process_line(line) - yield ({'input_1': x1, 'input_2': x2}, {'output': y}) - f.close() + with open(path) as f: + for line in f: + # create numpy arrays of input data + # and labels, from each line in the file + x1, x2, y = process_line(line) + yield ({'input_1': x1, 'input_2': x2}, {'output': y}) model.fit_generator(generate_arrays_from_file('/my_file.txt'), steps_per_epoch=10000, epochs=10) @@ -2056,15 +2118,20 @@ def generate_arrays_from_file(path): ' the `keras.utils.Sequence` class.') # Prepare display labels. 
- out_labels = self._get_deduped_metrics_names() + out_labels = self.metrics_names callback_metrics = out_labels + ['val_' + n for n in out_labels] # prepare callbacks self.history = cbks.History() - callbacks = [cbks.BaseLogger()] + (callbacks or []) + [self.history] + _callbacks = [cbks.BaseLogger( + stateful_metrics=self.stateful_metric_names)] if verbose: - callbacks += [cbks.ProgbarLogger(count_mode='steps')] - callbacks = cbks.CallbackList(callbacks) + _callbacks.append( + cbks.ProgbarLogger( + count_mode='steps', + stateful_metrics=self.stateful_metric_names)) + _callbacks += (callbacks or []) + [self.history] + callbacks = cbks.CallbackList(_callbacks) # it's possible to callback a different model than self: if hasattr(self, 'callback_model') and self.callback_model: diff --git a/keras/initializers.py b/keras/initializers.py index 7d90bd7d79c..7032e9bf59a 100644 --- a/keras/initializers.py +++ b/keras/initializers.py @@ -499,5 +499,5 @@ def get(identifier): elif callable(identifier): return identifier else: - raise ValueError('Could not interpret initializer identifier:', - identifier) + raise ValueError('Could not interpret initializer identifier: ' + + str(identifier)) diff --git a/keras/layers/normalization.py b/keras/layers/normalization.py index 2faf88cf7f5..e762bfd11ca 100644 --- a/keras/layers/normalization.py +++ b/keras/layers/normalization.py @@ -180,6 +180,14 @@ def normalize_inference(): inputs, self.gamma, self.beta, reduction_axes, epsilon=self.epsilon) + if K.backend() != 'cntk': + sample_size = K.prod([K.shape(inputs)[axis] + for axis in reduction_axes]) + sample_size = K.cast(sample_size, dtype=K.dtype(inputs)) + + # sample variance - unbiased estimator of population variance + variance *= sample_size / (sample_size - (1.0 + self.epsilon)) + self.add_update([K.moving_average_update(self.moving_mean, mean, self.momentum), diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index c3ddd5c9448..81c367f92d9 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -518,12 +518,14 @@ def __call__(self, inputs, initial_state=None, constants=None, **kwargs): self._num_constants = len(constants) additional_specs += self.constants_spec # at this point additional_inputs cannot be empty - is_keras_tensor = hasattr(additional_inputs[0], '_keras_history') + is_keras_tensor = K.is_keras_tensor(additional_inputs[0]) for tensor in additional_inputs: - if hasattr(tensor, '_keras_history') != is_keras_tensor: + if K.is_keras_tensor(tensor) != is_keras_tensor: raise ValueError('The initial state or constants of an RNN' ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') + ' Keras tensors and non-Keras tensors' + ' (a "Keras tensor" is a tensor that was' + ' returned by a Keras layer, or by `Input`)') if is_keras_tensor: # Compute the full input spec, including state and constants @@ -783,7 +785,8 @@ class SimpleRNNCell(Layer): units: Positive integer, dimensionality of the output space. activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, @@ -941,7 +944,8 @@ class SimpleRNN(RNN): units: Positive integer, dimensionality of the output space. 
activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, @@ -1153,11 +1157,15 @@ class GRUCell(Layer): units: Positive integer, dimensionality of the output space. activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). recurrent_activation: Activation function to use for the recurrent step (see [activations](../activations.md)). + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, used for the linear transformation of the inputs @@ -1393,11 +1401,15 @@ class GRU(RNN): units: Positive integer, dimensionality of the output space. activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). recurrent_activation: Activation function to use for the recurrent step (see [activations](../activations.md)). + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, used for the linear transformation of the inputs @@ -1632,11 +1644,15 @@ class LSTMCell(Layer): units: Positive integer, dimensionality of the output space. activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). recurrent_activation: Activation function to use for the recurrent step (see [activations](../activations.md)). + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, used for the linear transformation of the inputs @@ -1896,11 +1912,15 @@ class LSTM(RNN): units: Positive integer, dimensionality of the output space. activation: Activation function to use (see [activations](../activations.md)). - If you pass None, no activation is applied + Default: hyperbolic tangent (`tanh`). + If you pass `None`, no activation is applied (ie. "linear" activation: `a(x) = x`). recurrent_activation: Activation function to use for the recurrent step (see [activations](../activations.md)). + Default: hard sigmoid (`hard_sigmoid`). + If you pass `None`, no activation is applied + (ie. "linear" activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix, used for the linear transformation of the inputs.
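# Clarifying sketch (illustrative only, not part of this diff): the docstring
# additions above document defaults that already exist, so the first two layers
# below are configured identically, and passing None selects the identity
# activation.
from keras.layers import LSTM

LSTM(32)
LSTM(32, activation='tanh', recurrent_activation='hard_sigmoid')
LSTM(32, activation=None)  # linear activation: a(x) = x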
diff --git a/keras/layers/wrappers.py b/keras/layers/wrappers.py index 7588a065f0b..a5eed289a4c 100644 --- a/keras/layers/wrappers.py +++ b/keras/layers/wrappers.py @@ -42,6 +42,14 @@ def activity_regularizer(self): else: return None + @property + def trainable(self): + return self.layer.trainable + + @trainable.setter + def trainable(self, value): + self.layer.trainable = value + @property def trainable_weights(self): return self.layer.trainable_weights @@ -246,7 +254,6 @@ class Bidirectional(Wrapper): """ def __init__(self, layer, merge_mode='concat', weights=None, **kwargs): - super(Bidirectional, self).__init__(layer, **kwargs) if merge_mode not in ['sum', 'mul', 'ave', 'concat', None]: raise ValueError('Invalid merge mode. ' 'Merge mode should be one of ' @@ -266,6 +273,19 @@ def __init__(self, layer, merge_mode='concat', weights=None, **kwargs): self.return_sequences = layer.return_sequences self.return_state = layer.return_state self.supports_masking = True + self._trainable = True + super(Bidirectional, self).__init__(layer, **kwargs) + self.input_spec = layer.input_spec + + @property + def trainable(self): + return self._trainable + + @trainable.setter + def trainable(self, value): + self._trainable = value + self.forward_layer.trainable = value + self.backward_layer.trainable = value def get_weights(self): return self.forward_layer.get_weights() + self.backward_layer.get_weights() @@ -294,6 +314,60 @@ def compute_output_shape(self, input_shape): return [output_shape] + state_shape + copy.copy(state_shape) return output_shape + def __call__(self, inputs, initial_state=None, **kwargs): + if isinstance(inputs, list): + if len(inputs) > 1: + initial_state = inputs[1:] + inputs = inputs[0] + + if initial_state is None: + return super(Bidirectional, self).__call__(inputs, **kwargs) + + # Standardize `initial_state` into a list + if isinstance(initial_state, tuple): + initial_state = list(initial_state) + elif not isinstance(initial_state, list): + initial_state = [initial_state] + + # Check if `initial_state` can be split in half + num_states = len(initial_state) + if num_states % 2 > 0: + raise ValueError( + 'When passing `initial_state` to a Bidirectional RNN, the state ' + 'should be a list containing the states of the underlying RNNs. 
' + 'Found: ' + str(initial_state)) + + # Applies the same workaround as in `RNN.__call__`, without handling constants + kwargs['initial_state'] = initial_state + additional_inputs = initial_state + additional_specs = [InputSpec(shape=K.int_shape(state)) + for state in initial_state] + self.forward_layer.state_spec = additional_specs[:num_states // 2] + self.backward_layer.state_spec = additional_specs[num_states // 2:] + + is_keras_tensor = K.is_keras_tensor(additional_inputs[0]) + for tensor in additional_inputs: + if K.is_keras_tensor(tensor) != is_keras_tensor: + raise ValueError('The initial state of a Bidirectional' + ' layer cannot be specified with a mix of' + ' Keras tensors and non-Keras tensors' + ' (a "Keras tensor" is a tensor that was' + ' returned by a Keras layer, or by `Input`)') + + if is_keras_tensor: + # Compute the full input spec, including state + full_input = [inputs] + additional_inputs + full_input_spec = self.input_spec + additional_specs + + # Perform the call with temporarily replaced input_spec + original_input_spec = self.input_spec + self.input_spec = full_input_spec + output = super(Bidirectional, self).__call__(full_input, **kwargs) + self.input_spec = original_input_spec + return output + else: + return super(Bidirectional, self).__call__(inputs, **kwargs) + def call(self, inputs, training=None, mask=None, initial_state=None): kwargs = {} if has_arg(self.layer.call, 'training'): @@ -302,11 +376,6 @@ def call(self, inputs, training=None, mask=None, initial_state=None): kwargs['mask'] = mask if initial_state is not None and has_arg(self.layer.call, 'initial_state'): - if not isinstance(initial_state, list): - raise ValueError( - 'When passing `initial_state` to a Bidirectional RNN, the state ' - 'should be a list containing the states of the underlying RNNs. 
' - 'Found: ' + str(initial_state)) forward_state = initial_state[:len(initial_state) // 2] backward_state = initial_state[len(initial_state) // 2:] y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs) diff --git a/keras/metrics.py b/keras/metrics.py index bcc625bc69c..3d5df23b9ec 100644 --- a/keras/metrics.py +++ b/keras/metrics.py @@ -20,6 +20,7 @@ from .losses import poisson from .losses import cosine_proximity from .utils.generic_utils import deserialize_keras_object +from .utils.generic_utils import serialize_keras_object def binary_accuracy(y_true, y_pred): @@ -56,20 +57,22 @@ def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): def serialize(metric): - return metric.__name__ + return serialize_keras_object(metric) -def deserialize(name, custom_objects=None): - return deserialize_keras_object(name, +def deserialize(config, custom_objects=None): + return deserialize_keras_object(config, module_objects=globals(), custom_objects=custom_objects, printable_module_name='metric function') def get(identifier): - if isinstance(identifier, six.string_types): - identifier = str(identifier) - return deserialize(identifier) + if isinstance(identifier, dict): + config = {'class_name': str(identifier), 'config': {}} + return deserialize(config) + elif isinstance(identifier, six.string_types): + return deserialize(str(identifier)) elif callable(identifier): return identifier else: diff --git a/keras/models.py b/keras/models.py index 1dcee09608c..246e5aefa84 100644 --- a/keras/models.py +++ b/keras/models.py @@ -718,25 +718,24 @@ def set_weights(self, weights): self.build() self.model.set_weights(weights) - def load_weights(self, filepath, by_name=False, skip_mismatch=False): + def load_weights(self, filepath, by_name=False, skip_mismatch=False, reshape=False): if h5py is None: raise ImportError('`load_weights` requires h5py.') - f = h5py.File(filepath, mode='r') - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] + with h5py.File(filepath, mode='r') as f: + if 'layer_names' not in f.attrs and 'model_weights' in f: + f = f['model_weights'] - # Legacy support - if legacy_models.needs_legacy_support(self): - layers = legacy_models.legacy_sequential_layers(self) - else: - layers = self.layers - if by_name: - topology.load_weights_from_hdf5_group_by_name(f, layers, - skip_mismatch=skip_mismatch) - else: - topology.load_weights_from_hdf5_group(f, layers) - if hasattr(f, 'close'): - f.close() + # Legacy support + if legacy_models.needs_legacy_support(self): + layers = legacy_models.legacy_sequential_layers(self) + else: + layers = self.layers + if by_name: + topology.load_weights_from_hdf5_group_by_name(f, layers, + skip_mismatch=skip_mismatch, + reshape=reshape) + else: + topology.load_weights_from_hdf5_group(f, layers, reshape=reshape) def save_weights(self, filepath, overwrite=True): if h5py is None: @@ -752,10 +751,9 @@ def save_weights(self, filepath, overwrite=True): else: layers = self.layers - f = h5py.File(filepath, 'w') - topology.save_weights_to_hdf5_group(f, layers) - f.flush() - f.close() + with h5py.File(filepath, 'w') as f: + topology.save_weights_to_hdf5_group(f, layers) + f.flush() def compile(self, optimizer, loss, metrics=None, @@ -1226,13 +1224,12 @@ def fit_generator(self, generator, ```python def generate_arrays_from_file(path): while 1: - f = open(path) - for line in f: - # create Numpy arrays of input data - # and labels, from each line in the file - x, y = process_line(line) - yield (x, y) - f.close() + with open(path) 
as f: + for line in f: + # create Numpy arrays of input data + # and labels, from each line in the file + x, y = process_line(line) + yield (x, y) model.fit_generator(generate_arrays_from_file('/my_file.txt'), steps_per_epoch=1000, epochs=10) diff --git a/keras/optimizers.py b/keras/optimizers.py index 83349fcd6f4..5ea63dccb62 100644 --- a/keras/optimizers.py +++ b/keras/optimizers.py @@ -76,6 +76,12 @@ def get_updates(self, loss, params): def get_gradients(self, loss, params): grads = K.gradients(loss, params) + if None in grads: + raise ValueError('An operation has `None` for gradient. ' + 'Please make sure that all of your ops have a ' + 'gradient defined (i.e. are differentiable). ' + 'Common ops without gradient: ' + 'K.argmax, K.round, K.eval.') if hasattr(self, 'clipnorm') and self.clipnorm > 0: norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads])) grads = [clip_norm(g, self.clipnorm, norm) for g in grads] @@ -602,8 +608,10 @@ def get_updates(self, loss, params): t = K.cast(self.iterations, K.floatx()) + 1 # Due to the recommendations in [2], i.e. warming momentum schedule - momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) - momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) + momentum_cache_t = self.beta_1 * ( + 1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay))) + momentum_cache_t_1 = self.beta_1 * ( + 1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay))) m_schedule_new = self.m_schedule * momentum_cache_t m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1 self.updates.append((self.m_schedule, m_schedule_new)) @@ -751,5 +759,5 @@ def get(identifier): if isinstance(identifier, Optimizer): return identifier else: - raise ValueError('Could not interpret optimizer identifier:', - identifier) + raise ValueError('Could not interpret optimizer identifier: ' + + str(identifier)) diff --git a/keras/preprocessing/image.py b/keras/preprocessing/image.py index 4d951ac4d0e..d230baf7114 100644 --- a/keras/preprocessing/image.py +++ b/keras/preprocessing/image.py @@ -221,7 +221,7 @@ def apply_transform(x, x_channel, final_affine_matrix, final_offset, - order=0, + order=1, mode=fill_mode, cval=cval) for x_channel in x] x = np.stack(channel_images, axis=0) @@ -748,11 +748,10 @@ def fit(self, x, if self.zca_whitening: flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3])) - num_examples = flat_x.shape[0] - _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples)) - s_expand = np.hstack((s, np.zeros(vt.shape[0] - num_examples, - dtype=flat_x.dtype))) - self.principal_components = (vt.T / np.sqrt(s_expand ** 2 + self.zca_epsilon)).dot(vt) + sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0] + u, s, _ = linalg.svd(sigma) + s_inv = 1. / np.sqrt(s[np.newaxis] + self.zca_epsilon) + self.principal_components = (u * s_inv).dot(u.T) class Iterator(Sequence): @@ -873,9 +872,9 @@ def __init__(self, x, y, image_data_generator, data_format=None, save_to_dir=None, save_prefix='', save_format='png'): if y is not None and len(x) != len(y): - raise ValueError('X (images tensor) and y (labels) ' + raise ValueError('x (images tensor) and y (labels) ' 'should have the same length. 
' - 'Found: X.shape = %s, y.shape = %s' % + 'Found: x.shape = %s, y.shape = %s' % (np.asarray(x).shape, np.asarray(y).shape)) if data_format is None: @@ -956,13 +955,16 @@ def _count_valid_files_in_directory(directory, white_list_formats, follow_links) the directory. """ def _recursive_list(subpath): - return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0]) + return sorted(os.walk(subpath, followlinks=follow_links), key=lambda x: x[0]) samples = 0 for _, _, files in _recursive_list(directory): for fname in files: is_valid = False for extension in white_list_formats: + if fname.lower().endswith('.tiff'): + warnings.warn('Using \'.tiff\' files with multiple bands will cause distortion. ' + 'Please verify your output.') if fname.lower().endswith('.' + extension): is_valid = True break @@ -990,7 +992,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats, the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]). """ def _recursive_list(subpath): - return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0]) + return sorted(os.walk(subpath, followlinks=follow_links), key=lambda x: x[0]) classes = [] filenames = [] @@ -1094,7 +1096,7 @@ def __init__(self, directory, image_data_generator, self.save_format = save_format self.interpolation = interpolation - white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'} + white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'} # first, count the number of samples and classes self.samples = 0 diff --git a/keras/regularizers.py b/keras/regularizers.py index ff851ddb3f4..d2175307c69 100644 --- a/keras/regularizers.py +++ b/keras/regularizers.py @@ -84,5 +84,5 @@ def get(identifier): elif callable(identifier): return identifier else: - raise ValueError('Could not interpret regularizer identifier:', - identifier) + raise ValueError('Could not interpret regularizer identifier: ' + + str(identifier)) diff --git a/keras/utils/data_utils.py b/keras/utils/data_utils.py index b420fc67923..ccab6a355fc 100644 --- a/keras/utils/data_utils.py +++ b/keras/utils/data_utils.py @@ -66,10 +66,9 @@ def chunk_read(response, chunk_size=8192, reporthook=None): else: break - response = urlopen(url, data) - with open(filename, 'wb') as fd: - for chunk in chunk_read(response, reporthook=reporthook): - fd.write(chunk) + with closing(urlopen(url, data)) as response, open(filename, 'wb') as fd: + for chunk in chunk_read(response, reporthook=reporthook): + fd.write(chunk) else: from six.moves.urllib.request import urlretrieve diff --git a/keras/utils/generic_utils.py b/keras/utils/generic_utils.py index 532e6a691d2..c54f6a7545a 100644 --- a/keras/utils/generic_utils.py +++ b/keras/utils/generic_utils.py @@ -13,6 +13,7 @@ import types as python_types import inspect import codecs +import collections _GLOBAL_CUSTOM_OBJECTS = {} @@ -286,52 +287,67 @@ class Progbar(object): # Arguments target: Total number of steps expected, None if unknown. + width: Progress bar width on screen. + verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over time. Metrics in this list + will be displayed as-is. All others will be averaged + by the progbar before display. interval: Minimum visual progress update interval (in seconds). 
""" - def __init__(self, target, width=30, verbose=1, interval=0.05): - self.width = width + def __init__(self, target, width=30, verbose=1, interval=0.05, + stateful_metrics=None): self.target = target - self.sum_values = {} - self.unique_values = [] - self.start = time.time() - self.last_update = 0 - self.interval = interval - self.total_width = 0 - self.seen_so_far = 0 + self.width = width self.verbose = verbose + self.interval = interval + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()) or 'ipykernel' in sys.modules) + self._total_width = 0 + self._seen_so_far = 0 + self._values = collections.OrderedDict() + self._start = time.time() + self._last_update = 0 - def update(self, current, values=None, force=False): + def update(self, current, values=None): """Updates the progress bar. # Arguments current: Index of current step. - values: List of tuples (name, value_for_last_step). - The progress bar will display averages for these values. - force: Whether to force visual progress update. + values: List of tuples: + `(name, value_for_last_step)`. + If `name` is in `stateful_metrics`, + `value_for_last_step` will be displayed as-is. + Else, an average of the metric over time will be displayed. """ values = values or [] for k, v in values: - if k not in self.sum_values: - self.sum_values[k] = [v * (current - self.seen_so_far), - current - self.seen_so_far] - self.unique_values.append(k) + if k not in self.stateful_metrics: + if k not in self._values: + self._values[k] = [v * (current - self._seen_so_far), + current - self._seen_so_far] + else: + self._values[k][0] += v * (current - self._seen_so_far) + self._values[k][1] += (current - self._seen_so_far) else: - self.sum_values[k][0] += v * (current - self.seen_so_far) - self.sum_values[k][1] += (current - self.seen_so_far) - self.seen_so_far = current + self._values[k] = v + self._seen_so_far = current now = time.time() - info = ' - %.0fs' % (now - self.start) + info = ' - %.0fs' % (now - self._start) if self.verbose == 1: - if (not force and (now - self.last_update) < self.interval and + if (now - self._last_update < self.interval and self.target is not None and current < self.target): return - prev_total_width = self.total_width + prev_total_width = self._total_width if self._dynamic_display: sys.stdout.write('\b' * prev_total_width) sys.stdout.write('\r') @@ -355,11 +371,11 @@ def update(self, current, values=None, force=False): else: bar = '%7d/Unknown' % current - self.total_width = len(bar) + self._total_width = len(bar) sys.stdout.write(bar) if current: - time_per_unit = (now - self.start) / current + time_per_unit = (now - self._start) / current else: time_per_unit = 0 if self.target is not None and current < self.target: @@ -380,21 +396,21 @@ def update(self, current, values=None, force=False): else: info += ' %.0fus/step' % (time_per_unit * 1e6) - for k in self.unique_values: + for k in self._values: info += ' - %s:' % k - if isinstance(self.sum_values[k], list): + if isinstance(self._values[k], list): avg = np.mean( - self.sum_values[k][0] / max(1, self.sum_values[k][1])) + self._values[k][0] / max(1, self._values[k][1])) if abs(avg) > 1e-3: info += ' %.4f' % avg else: info += ' %.4e' % avg else: - info += ' %s' % self.sum_values[k] + info += ' %s' % self._values[k] - self.total_width += len(info) - if prev_total_width > self.total_width: - info += (' ' * (prev_total_width - 
self.total_width)) + self._total_width += len(info) + if prev_total_width > self._total_width: + info += (' ' * (prev_total_width - self._total_width)) if self.target is not None and current >= self.target: info += '\n' @@ -404,10 +420,10 @@ def update(self, current, values=None, force=False): elif self.verbose == 2: if self.target is None or current >= self.target: - for k in self.unique_values: + for k in self._values: info += ' - %s:' % k avg = np.mean( - self.sum_values[k][0] / max(1, self.sum_values[k][1])) + self._values[k][0] / max(1, self._values[k][1])) if avg > 1e-3: info += ' %.4f' % avg else: @@ -417,7 +433,7 @@ def update(self, current, values=None, force=False): sys.stdout.write(info) sys.stdout.flush() - self.last_update = now + self._last_update = now def add(self, n, values=None): - self.update(self.seen_so_far + n, values) + self.update(self._seen_so_far + n, values) diff --git a/keras/utils/io_utils.py b/keras/utils/io_utils.py index deac41b2484..8e663f499cb 100644 --- a/keras/utils/io_utils.py +++ b/keras/utils/io_utils.py @@ -4,9 +4,9 @@ from __future__ import print_function import numpy as np -import sys from collections import defaultdict +import six try: import h5py except ImportError: @@ -141,13 +141,11 @@ def ask_to_proceed_with_overwrite(filepath): # Returns True if we can proceed with overwrite, False otherwise. """ - get_input = input - if sys.version_info[:2] <= (2, 7): - get_input = raw_input - overwrite = get_input('[WARNING] %s already exists - overwrite? ' - '[y/n]' % (filepath)) - while overwrite not in ['y', 'n']: - overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).') + overwrite = six.moves.input('[WARNING] %s already exists - overwrite? ' + '[y/n]' % (filepath)).strip().lower() + while overwrite not in ('y', 'n'): + overwrite = six.moves.input('Enter "y" (overwrite) or "n" ' + '(cancel).').strip().lower() if overwrite == 'n': return False print('[TIP] Next time specify overwrite=True!') diff --git a/keras/utils/training_utils.py b/keras/utils/training_utils.py index d5e889ac5f4..48d452434c8 100644 --- a/keras/utils/training_utils.py +++ b/keras/utils/training_utils.py @@ -19,7 +19,7 @@ def _normalize_device_name(name): return name -def multi_gpu_model(model, gpus): +def multi_gpu_model(model, gpus=None): """Replicates a model on different GPUs. Specifically, this function implements single-machine @@ -101,6 +101,14 @@ def multi_gpu_model(model, gpus): if K.backend() != 'tensorflow': raise ValueError('`multi_gpu_model` is only available ' 'with the TensorFlow backend.') + + available_devices = _get_available_devices() + available_devices = [_normalize_device_name(name) for name in available_devices] + if not gpus: + # Using all visible GPUs when not specifying `gpus` + # e.g. 
CUDA_VISIBLE_DEVICES=0,2 python3 keras_mgpu.py + gpus = len([x for x in available_devices if 'gpu' in x]) + if isinstance(gpus, (list, tuple)): if len(gpus) <= 1: raise ValueError('For multi-gpu usage to be effective, ' @@ -119,8 +127,6 @@ def multi_gpu_model(model, gpus): import tensorflow as tf target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids] - available_devices = _get_available_devices() - available_devices = [_normalize_device_name(name) for name in available_devices] for device in target_devices: if device not in available_devices: raise ValueError( diff --git a/keras/wrappers/scikit_learn.py b/keras/wrappers/scikit_learn.py index 28449171e40..c9663c62af4 100644 --- a/keras/wrappers/scikit_learn.py +++ b/keras/wrappers/scikit_learn.py @@ -176,7 +176,7 @@ class KerasClassifier(BaseWrapper): """Implementation of the scikit-learn classifier API for Keras. """ - def fit(self, x, y, **kwargs): + def fit(self, x, y, sample_weight=None, **kwargs): """Constructs a new model with `build_fn` & fit the model to `(x, y)`. # Arguments @@ -204,6 +204,8 @@ def fit(self, x, y, **kwargs): else: raise ValueError('Invalid shape for y: ' + str(y.shape)) self.n_classes_ = len(self.classes_) + if sample_weight is not None: + kwargs['sample_weight'] = sample_weight return super(KerasClassifier, self).fit(x, y, **kwargs) def predict(self, x, **kwargs): @@ -222,7 +224,12 @@ def predict(self, x, **kwargs): Class predictions. """ kwargs = self.filter_sk_params(Sequential.predict_classes, kwargs) - classes = self.model.predict_classes(x, **kwargs) + + proba = self.model.predict(x, **kwargs) + if proba.shape[-1] > 1: + classes = proba.argmax(axis=-1) + else: + classes = (proba > 0.5).astype('int32') return self.classes_[classes] def predict_proba(self, x, **kwargs): @@ -245,7 +252,7 @@ def predict_proba(self, x, **kwargs): (instead of `(n_sample, 1)` as in Keras). 
""" kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs) - probs = self.model.predict_proba(x, **kwargs) + probs = self.model.predict(x, **kwargs) # check if binary classification if probs.shape[1] == 1: diff --git a/setup.py b/setup.py index dcc8ceb3073..ae91146760d 100644 --- a/setup.py +++ b/setup.py @@ -3,12 +3,12 @@ setup(name='Keras', - version='2.1.3', + version='2.1.4', description='Deep Learning for humans', author='Francois Chollet', author_email='francois.chollet@gmail.com', url='https://github.com/keras-team/keras', - download_url='https://github.com/keras-team/keras/tarball/2.1.3', + download_url='https://github.com/keras-team/keras/tarball/2.1.4', license='MIT', install_requires=['numpy>=1.9.1', 'scipy>=0.14', diff --git a/tests/keras/applications/applications_test.py b/tests/keras/applications/applications_test.py index a75e79c44eb..a5c2c65470b 100644 --- a/tests/keras/applications/applications_test.py +++ b/tests/keras/applications/applications_test.py @@ -4,7 +4,6 @@ from multiprocessing import Process, Queue from keras.utils.test_utils import keras_test from keras.utils.test_utils import layer_test -from keras.utils.generic_utils import CustomObjectScope from keras.models import Sequential from keras import applications from keras import backend as K @@ -170,54 +169,5 @@ def test_nasnet(): _test_app_pooling(app, last_dim) -@pytest.mark.skipif(K.backend() != 'tensorflow', reason='Requires TF backend') -@keras_test -def test_depthwise_conv_2d(): - _convolution_paddings = ['valid', 'same'] - num_samples = 2 - stack_size = 3 - num_row = 7 - num_col = 6 - - with CustomObjectScope( - {'relu6': applications.mobilenet.relu6, - 'DepthwiseConv2D': applications.mobilenet.DepthwiseConv2D}): - for padding in _convolution_paddings: - for strides in [(1, 1), (2, 2)]: - for multiplier in [1, 2]: - if padding == 'same' and strides != (1, 1): - continue - - layer_test(applications.mobilenet.DepthwiseConv2D, - kwargs={'kernel_size': (3, 3), - 'padding': padding, - 'strides': strides, - 'depth_multiplier': multiplier}, - input_shape=(num_samples, - num_row, - num_col, - stack_size)) - - layer_test(applications.mobilenet.DepthwiseConv2D, - kwargs={'kernel_size': 3, - 'padding': padding, - 'data_format': 'channels_first', - 'activation': None, - 'depthwise_regularizer': 'l2', - 'bias_regularizer': 'l2', - 'activity_regularizer': 'l2', - 'depthwise_constraint': 'unit_norm', - 'strides': strides, - 'depth_multiplier': multiplier}, - input_shape=(num_samples, stack_size, num_row, num_col)) - - # Test invalid use case - with pytest.raises(ValueError): - Sequential([applications.mobilenet.DepthwiseConv2D( - kernel_size=3, - padding=padding, - batch_input_shape=(None, None, 5, None))]) - - if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/backend/backend_test.py b/tests/keras/backend/backend_test.py index 0938bf4b162..cfc0885ca38 100644 --- a/tests/keras/backend/backend_test.py +++ b/tests/keras/backend/backend_test.py @@ -844,6 +844,28 @@ def test_conv3d(self): with pytest.raises(ValueError): k.conv3d(k.variable(xval), k.variable(kernel_val), data_format='channels_middle') + @pytest.mark.parametrize('k', [KTF], ids=['TensorFlow']) + def test_depthwise_conv_2d(self, k): + for data_format in ['channels_first', 'channels_last']: + x_shape = (4, 4) + if data_format == 'channels_first': + input_shape = (2, 3) + x_shape + elif data_format == 'channels_last': + input_shape = (2,) + x_shape + (3,) + kernel_shape = (3, 3, 3, 2) + + x_val = 
np.ones(input_shape) + kernel_val = np.arange(np.prod(kernel_shape)).reshape(kernel_shape) + z = k.eval(k.depthwise_conv2d(k.variable(x_val), k.variable(kernel_val), + data_format=data_format)) + + for z_i in np.split(z, 6, axis=1 if data_format == 'channels_first' else -1): + assert_allclose(z_i, z_i[0] * np.ones_like(z_i)) + + # Test invalid use cases + with pytest.raises(ValueError): + k.depthwise_conv2d(k.variable(x_val), k.variable(kernel_val), data_format='channels_middle') + def test_pool2d(self): check_single_tensor_operation('pool2d', (5, 10, 12, 3), BACKENDS, cntk_dynamicity=True, @@ -885,8 +907,8 @@ def test_random_normal(self): mean = 0. std = 1. for k in BACKENDS: - rand = k.eval(k.random_normal((300, 100), mean=mean, stddev=std)) - assert rand.shape == (300, 100) + rand = k.eval(k.random_normal((300, 200), mean=mean, stddev=std, seed=1337)) + assert rand.shape == (300, 200) assert np.abs(np.mean(rand) - mean) < 0.015 assert np.abs(np.std(rand) - std) < 0.015 diff --git a/tests/keras/engine/test_topology.py b/tests/keras/engine/test_topology.py index 59662d49d79..91cbdda0b32 100644 --- a/tests/keras/engine/test_topology.py +++ b/tests/keras/engine/test_topology.py @@ -656,6 +656,29 @@ def test_recursion_with_bn_and_loss(): model2.fit(x, y, verbose=0, epochs=1) +@keras_test +def test_activity_regularization_with_model_composition(): + + def reg(x): + return K.sum(x) + + net_a_input = Input((2,)) + net_a = net_a_input + net_a = Dense(2, kernel_initializer='ones', + use_bias=False, + activity_regularizer=reg)(net_a) + model_a = Model([net_a_input], [net_a]) + + net_b_input = Input((2,)) + net_b = model_a(net_b_input) + model_b = Model([net_b_input], [net_b]) + + model_b.compile(optimizer='sgd', loss=None) + x = np.ones((1, 2)) + loss = model_b.evaluate(x) + assert loss == 4 + + @keras_test def test_shared_layer_depth_is_correct(): # Basic outline here: we have a shared embedding layer, and two inputs that go through diff --git a/tests/keras/engine/test_training.py b/tests/keras/engine/test_training.py index 6846a682d70..a928bdc65f5 100644 --- a/tests/keras/engine/test_training.py +++ b/tests/keras/engine/test_training.py @@ -568,6 +568,17 @@ def test_trainable_argument(): assert_allclose(out, out_2) +@keras_test +def test_with_list_as_targets(): + model = Sequential() + model.add(Dense(1, input_dim=3, trainable=False)) + model.compile('rmsprop', 'mse') + + x = np.random.random((2, 3)) + y = [0, 1] + model.train_on_batch(x, y) + + @keras_test def test_check_not_failing(): a = np.random.random((2, 1, 3)) diff --git a/tests/keras/layers/convolutional_test.py b/tests/keras/layers/convolutional_test.py index 4bb3fe58430..296d9a15b1d 100644 --- a/tests/keras/layers/convolutional_test.py +++ b/tests/keras/layers/convolutional_test.py @@ -238,21 +238,22 @@ def test_separable_conv_1d(): num_step = 9 for padding in _convolution_paddings: - for multiplier in [1, 2]: - for dilation_rate in [1, 2]: - if padding == 'same': - continue - if dilation_rate != 1: - continue + for strides in [1, 2]: + for multiplier in [1, 2]: + for dilation_rate in [1, 2]: + if padding == 'same' and strides != 1: + continue + if dilation_rate != 1 and strides != 1: + continue - layer_test(convolutional.SeparableConv1D, - kwargs={'filters': filters, - 'kernel_size': 3, - 'padding': padding, - 'strides': 1, - 'depth_multiplier': multiplier, - 'dilation_rate': dilation_rate}, - input_shape=(num_samples, num_step, stack_size)) + layer_test(convolutional.SeparableConv1D, + kwargs={'filters': filters, + 'kernel_size': 3, 
+ 'padding': padding, + 'strides': strides, + 'depth_multiplier': multiplier, + 'dilation_rate': dilation_rate}, + input_shape=(num_samples, num_step, stack_size)) layer_test(convolutional.SeparableConv1D, kwargs={'filters': filters, diff --git a/tests/keras/layers/wrappers_test.py b/tests/keras/layers/wrappers_test.py index 50665647c3a..c471a49ed37 100644 --- a/tests/keras/layers/wrappers_test.py +++ b/tests/keras/layers/wrappers_test.py @@ -137,6 +137,22 @@ def test_TimeDistributed_learning_phase(): assert_allclose(np.mean(y), 0., atol=1e-1, rtol=1e-1) +@keras_test +def test_TimeDistributed_trainable(): + # test layers that need learning_phase to be set + x = Input(shape=(3, 2)) + layer = wrappers.TimeDistributed(layers.BatchNormalization()) + _ = layer(x) + assert len(layer.updates) == 2 + assert len(layer.trainable_weights) == 2 + layer.trainable = False + assert len(layer.updates) == 0 + assert len(layer.trainable_weights) == 0 + layer.trainable = True + assert len(layer.updates) == 2 + assert len(layer.trainable_weights) == 2 + + @keras_test def test_regularizers(): model = Sequential() @@ -309,21 +325,37 @@ def test_Bidirectional_state_reuse(): timesteps = 3 units = 3 - inputs = Input((timesteps, dim)) + input1 = Input((timesteps, dim)) layer = wrappers.Bidirectional(rnn(units, return_state=True, return_sequences=True)) - outputs = layer(inputs) - output, state = outputs[0], outputs[1:] + state = layer(input1)[1:] # test passing invalid initial_state: passing a tensor + input2 = Input((timesteps, dim)) with pytest.raises(ValueError): - output = wrappers.Bidirectional(rnn(units))(output, initial_state=state[0]) + output = wrappers.Bidirectional(rnn(units))(input2, initial_state=state[0]) # test valid usage: passing a list - output = wrappers.Bidirectional(rnn(units))(output, initial_state=state) - model = Model(inputs, output) - inputs = np.random.rand(samples, timesteps, dim) + output = wrappers.Bidirectional(rnn(units))(input2, initial_state=state) + model = Model([input1, input2], output) + assert len(model.layers) == 4 + assert isinstance(model.layers[-1].input, list) + inputs = [np.random.rand(samples, timesteps, dim), + np.random.rand(samples, timesteps, dim)] outputs = model.predict(inputs) +@keras_test +def test_Bidirectional_trainable(): + # test layers that need learning_phase to be set + x = Input(shape=(3, 2)) + layer = wrappers.Bidirectional(layers.SimpleRNN(3)) + _ = layer(x) + assert len(layer.trainable_weights) == 6 + layer.trainable = False + assert len(layer.trainable_weights) == 0 + layer.trainable = True + assert len(layer.trainable_weights) == 6 + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/metrics_test.py b/tests/keras/metrics_test.py index 56af403ce07..4d0f8915c9b 100644 --- a/tests/keras/metrics_test.py +++ b/tests/keras/metrics_test.py @@ -1,8 +1,10 @@ import pytest import numpy as np +import keras from keras import metrics from keras import backend as K +from keras.utils.test_utils import keras_test all_metrics = [ metrics.binary_accuracy, @@ -26,6 +28,7 @@ ] +@keras_test def test_metrics(): y_a = K.variable(np.random.random((6, 7))) y_b = K.variable(np.random.random((6, 7))) @@ -35,6 +38,7 @@ def test_metrics(): assert K.eval(output).shape == (6,) +@keras_test def test_sparse_metrics(): for metric in all_sparse_metrics: y_a = K.variable(np.random.randint(0, 7, (6,)), dtype=K.floatx()) @@ -66,7 +70,8 @@ def test_invalid_get(): @pytest.mark.skipif((K.backend() == 'cntk'), - reason="keras cntk backend does not support top_k 
yet") + reason='CNTK backend does not support top_k yet') +@keras_test def test_top_k_categorical_accuracy(): y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]])) @@ -82,7 +87,8 @@ def test_top_k_categorical_accuracy(): @pytest.mark.skipif((K.backend() == 'cntk'), - reason="keras cntk backend does not support top_k yet") + reason='CNTK backend does not support top_k yet') +@keras_test def test_sparse_top_k_categorical_accuracy(): y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]])) y_true = K.variable(np.array([[1], [0]])) @@ -100,5 +106,76 @@ def test_sparse_top_k_categorical_accuracy(): assert failure_result == 0 +@keras_test +def test_stateful_metrics(): + np.random.seed(1334) + + class BinaryTruePositives(keras.layers.Layer): + """Stateful Metric to count the total true positives over all batches. + + Assumes predictions and targets of shape `(samples, 1)`. + + # Arguments + threshold: Float, lower limit on prediction value that counts as a + positive class prediction. + name: String, name for the metric. + """ + + def __init__(self, name='true_positives', **kwargs): + super(BinaryTruePositives, self).__init__(name=name, **kwargs) + self.true_positives = K.variable(value=0, dtype='int32') + + def reset_states(self): + K.set_value(self.true_positives, 0) + + def __call__(self, y_true, y_pred): + """Computes the number of true positives in a batch. + + # Arguments + y_true: Tensor, batch_wise labels + y_pred: Tensor, batch_wise predictions + + # Returns + The total number of true positives seen this epoch at the + completion of the batch. + """ + y_true = K.cast(y_true, 'int32') + y_pred = K.cast(K.round(y_pred), 'int32') + correct_preds = K.cast(K.equal(y_pred, y_true), 'int32') + true_pos = K.cast(K.sum(correct_preds * y_true), 'int32') + current_true_pos = self.true_positives * 1 + self.add_update(K.update_add(self.true_positives, + true_pos), + inputs=[y_true, y_pred]) + return current_true_pos + true_pos + + metric_fn = BinaryTruePositives() + config = metrics.serialize(metric_fn) + metric_fn = metrics.deserialize( + config, custom_objects={'BinaryTruePositives': BinaryTruePositives}) + + # Test on simple model + inputs = keras.Input(shape=(2,)) + outputs = keras.layers.Dense(1, activation='sigmoid')(inputs) + model = keras.Model(inputs, outputs) + model.compile(optimizer='sgd', + loss='binary_crossentropy', + metrics=['acc', metric_fn]) + + # Test fit, evaluate + samples = 1000 + x = np.random.random((samples, 2)) + y = np.random.randint(2, size=(samples, 1)) + model.fit(x, y, epochs=1, batch_size=10) + outs = model.evaluate(x, y, batch_size=10) + preds = model.predict(x) + + def ref_true_pos(y_true, y_pred): + return np.sum(np.logical_and(y_pred > 0.5, y_true == 1)) + + # Test correctness (e.g. 
updates should have been run) + np.testing.assert_allclose(outs[2], ref_true_pos(y, preds), atol=1e-5) + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/optimizers_test.py b/tests/keras/optimizers_test.py index e7ba87c217d..eaff119d714 100644 --- a/tests/keras/optimizers_test.py +++ b/tests/keras/optimizers_test.py @@ -4,9 +4,9 @@ from numpy.testing import assert_allclose from keras.utils import test_utils -from keras import optimizers -from keras.models import Sequential -from keras.layers.core import Dense, Activation +from keras import optimizers, Input +from keras.models import Sequential, Model +from keras.layers.core import Dense, Activation, Lambda from keras.utils.test_utils import keras_test from keras.utils.np_utils import to_categorical from keras import backend as K @@ -64,10 +64,22 @@ def _test_optimizer(optimizer, target=0.75): assert_allclose(bias, 2.) +@keras_test +def _test_no_grad(optimizer): + inp = Input([3]) + x = Dense(10)(inp) + x = Lambda(lambda l: 1.0 * K.reshape(K.cast(K.argmax(l), 'float32'), [-1, 1]))(x) + mod = Model(inp, x) + mod.compile(optimizer, 'mse') + with pytest.raises(ValueError): + mod.fit(np.zeros([10, 3]), np.zeros([10, 1], np.float32), batch_size=10, epochs=10) + + @keras_test def test_sgd(): sgd = optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True) _test_optimizer(sgd) + _test_no_grad(sgd) @keras_test diff --git a/tests/keras/preprocessing/image_test.py b/tests/keras/preprocessing/image_test.py index 1f3ca76b750..e83af84c9a2 100644 --- a/tests/keras/preprocessing/image_test.py +++ b/tests/keras/preprocessing/image_test.py @@ -123,6 +123,9 @@ def test_image_data_generator_fit(self): # Test RBG x = np.random.random((32, 10, 10, 3)) generator.fit(x) + # Test more samples than dims + x = np.random.random((32, 4, 4, 1)) + generator.fit(x) generator = image.ImageDataGenerator( featurewise_center=True, samplewise_center=True, @@ -136,6 +139,9 @@ def test_image_data_generator_fit(self): # Test RBG x = np.random.random((32, 3, 10, 10)) generator.fit(x) + # Test more samples than dims + x = np.random.random((32, 1, 4, 4)) + generator.fit(x) def test_directory_iterator(self, tmpdir): num_classes = 2 diff --git a/tests/keras/preprocessing/text_test.py b/tests/keras/preprocessing/text_test.py index e531d9f16be..617c67e63a5 100644 --- a/tests/keras/preprocessing/text_test.py +++ b/tests/keras/preprocessing/text_test.py @@ -74,7 +74,7 @@ def test_tokenizer_oov_flag(): x_train = ['This text has only known words'] x_test = ['This text has some unknown words'] # 2 OOVs: some, unknown - # Defalut, without OOV flag + # Default, without OOV flag tokenizer = Tokenizer() tokenizer.fit_on_texts(x_train) x_test_seq = tokenizer.texts_to_sequences(x_test) diff --git a/tests/keras/regularizers_test.py b/tests/keras/regularizers_test.py index 61e6cec2eb3..aaa8402ddf1 100644 --- a/tests/keras/regularizers_test.py +++ b/tests/keras/regularizers_test.py @@ -1,10 +1,11 @@ import pytest -from keras.models import Sequential -from keras.layers import Dense +from keras.models import Sequential, Model +from keras.layers import Dense, Input, Average from keras.utils import np_utils from keras.utils import test_utils from keras import regularizers +from keras import backend as K data_dim = 5 num_classes = 2 @@ -32,6 +33,19 @@ def create_model(kernel_regularizer=None, activity_regularizer=None): return model +def create_multi_input_model_from(layer1, layer2): + input_1 = Input(shape=(data_dim,)) + input_2 = Input(shape=(data_dim,)) + out1 = layer1(input_1) 
+ out2 = layer2(input_2) + out = Average()([out1, out2]) + model = Model([input_1, input_2], out) + model.add_loss(K.mean(out2)) + model.add_loss(1) + model.add_loss(1) + return model + + def test_kernel_regularization(): x_train, y_train = get_data() for reg in [regularizers.l1(), @@ -52,5 +66,44 @@ def test_activity_regularization(): model.train_on_batch(x_train, y_train) +def test_regularization_shared_layer(): + dense_layer = Dense(num_classes, + kernel_regularizer=regularizers.l1(), + activity_regularizer=regularizers.l1()) + + model = create_multi_input_model_from(dense_layer, dense_layer) + model.compile(loss='categorical_crossentropy', optimizer='sgd') + assert len(model.losses) == 6 + + +def test_regularization_shared_model(): + dense_layer = Dense(num_classes, + kernel_regularizer=regularizers.l1(), + activity_regularizer=regularizers.l1()) + + input_tensor = Input(shape=(data_dim,)) + dummy_model = Model(input_tensor, dense_layer(input_tensor)) + + model = create_multi_input_model_from(dummy_model, dummy_model) + model.compile(loss='categorical_crossentropy', optimizer='sgd') + assert len(model.losses) == 6 + + +def test_regularization_shared_layer_in_different_models(): + shared_dense = Dense(num_classes, + kernel_regularizer=regularizers.l1(), + activity_regularizer=regularizers.l1()) + models = [] + for _ in range(2): + input_tensor = Input(shape=(data_dim,)) + unshared_dense = Dense(num_classes, kernel_regularizer=regularizers.l1()) + out = unshared_dense(shared_dense(input_tensor)) + models.append(Model(input_tensor, out)) + + model = create_multi_input_model_from(*models) + model.compile(loss='categorical_crossentropy', optimizer='sgd') + assert len(model.losses) == 8 + + if __name__ == '__main__': pytest.main([__file__]) diff --git a/tests/keras/test_callbacks.py b/tests/keras/test_callbacks.py index 9f4a105b206..3a556910f37 100644 --- a/tests/keras/test_callbacks.py +++ b/tests/keras/test_callbacks.py @@ -403,8 +403,6 @@ def make_model(): @keras_test -@pytest.mark.skipif((K.backend() != 'tensorflow'), - reason='Requires TensorFlow backend') def test_TensorBoard(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') @@ -551,8 +549,6 @@ def callbacks_factory(histogram_freq): @keras_test -@pytest.mark.skipif((K.backend() != 'tensorflow'), - reason='Requires TensorFlow backend') def test_TensorBoard_multi_input_output(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') @@ -628,8 +624,6 @@ def callbacks_factory(histogram_freq): @keras_test -@pytest.mark.skipif((K.backend() != 'tensorflow'), - reason='Requires TensorFlow backend') def test_TensorBoard_convnet(tmpdir): np.random.seed(np.random.randint(1, 1e7)) filepath = str(tmpdir / 'logs') @@ -753,8 +747,6 @@ def f(): @keras_test -@pytest.mark.skipif((K.backend() != 'tensorflow'), - reason="Requires TensorFlow backend") def test_TensorBoard_with_ReduceLROnPlateau(tmpdir): import shutil np.random.seed(np.random.randint(1, 1e7)) diff --git a/tests/keras/test_sequential_model.py b/tests/keras/test_sequential_model.py index f9b1adc45bd..b142e54131b 100644 --- a/tests/keras/test_sequential_model.py +++ b/tests/keras/test_sequential_model.py @@ -302,6 +302,7 @@ def test_clone_functional_model(): x_a = dense_1(input_a) x_a = keras.layers.Dropout(0.5)(x_a) + x_a = keras.layers.BatchNormalization()(x_a) x_b = dense_1(input_b) x_a = dense_2(x_a) outputs = keras.layers.add([x_a, x_b]) @@ -340,6 +341,7 @@ def test_clone_sequential_model(): model = keras.models.Sequential() 
model.add(keras.layers.Dense(4, input_shape=(4,))) + model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(0.5)) model.add(keras.layers.Dense(4)) diff --git a/tests/keras/utils/generic_utils_test.py b/tests/keras/utils/generic_utils_test.py index 810a315f17e..588a4ecb903 100644 --- a/tests/keras/utils/generic_utils_test.py +++ b/tests/keras/utils/generic_utils_test.py @@ -21,9 +21,8 @@ def test_progbar(): for target in (len(values_s) - 1, None): for verbose in (0, 1, 2): bar = Progbar(target, width=30, verbose=verbose, interval=0.05) - for force in (False, True): - for current, values in enumerate(values_s): - bar.update(current, values=values, force=force) + for current, values in enumerate(values_s): + bar.update(current, values=values) def test_custom_objects_scope(): diff --git a/tests/keras/utils/io_utils_test.py b/tests/keras/utils/io_utils_test.py index 2ff9396c6b2..6e58fd104a5 100644 --- a/tests/keras/utils/io_utils_test.py +++ b/tests/keras/utils/io_utils_test.py @@ -8,6 +8,7 @@ from keras.utils.io_utils import HDF5Matrix from keras.utils.io_utils import ask_to_proceed_with_overwrite import numpy as np +import six import warnings import h5py try: @@ -30,14 +31,13 @@ def in_tmpdir(tmpdir): def create_dataset(h5_path='test.h5'): X = np.random.randn(200, 10).astype('float32') y = np.random.randint(0, 2, size=(200, 1)) - f = h5py.File(h5_path, 'w') - # Creating dataset to store features - X_dset = f.create_dataset('my_data', (200, 10), dtype='f') - X_dset[:] = X - # Creating dataset to store labels - y_dset = f.create_dataset('my_labels', (200, 1), dtype='i') - y_dset[:] = y - f.close() + with h5py.File(h5_path, 'w') as f: + # Creating dataset to store features + X_dset = f.create_dataset('my_data', (200, 10), dtype='f') + X_dset[:] = X + # Creating dataset to store labels + y_dset = f.create_dataset('my_labels', (200, 1), dtype='i') + y_dset[:] = y def test_io_utils(in_tmpdir): @@ -106,20 +106,12 @@ def test_io_utils(in_tmpdir): def test_ask_to_proceed_with_overwrite(): - if sys.version_info[:2] <= (2, 7): - with patch('__builtin__.raw_input') as mock: - mock.return_value = 'y' - assert ask_to_proceed_with_overwrite('/tmp/not_exists') - - mock.return_value = 'n' - assert not ask_to_proceed_with_overwrite('/tmp/not_exists') - else: - with patch('builtins.input') as mock: - mock.return_value = 'y' - assert ask_to_proceed_with_overwrite('/tmp/not_exists') - - mock.return_value = 'n' - assert not ask_to_proceed_with_overwrite('/tmp/not_exists') + with patch('six.moves.input') as mock: + mock.return_value = 'y' + assert ask_to_proceed_with_overwrite('/tmp/not_exists') + + mock.return_value = 'n' + assert not ask_to_proceed_with_overwrite('/tmp/not_exists') if __name__ == '__main__': diff --git a/tests/keras/wrappers/scikit_learn_test.py b/tests/keras/wrappers/scikit_learn_test.py index 1e9bf6ec734..ebc11942595 100644 --- a/tests/keras/wrappers/scikit_learn_test.py +++ b/tests/keras/wrappers/scikit_learn_test.py @@ -75,7 +75,7 @@ def __call__(self, hidden_dims): def assert_classification_works(clf): - clf.fit(X_train, y_train, batch_size=batch_size, epochs=epochs) + clf.fit(X_train, y_train, sample_weight=np.ones(X_train.shape[0]), batch_size=batch_size, epochs=epochs) score = clf.score(X_train, y_train, batch_size=batch_size) assert np.isscalar(score) and np.isfinite(score) diff --git a/tests/test_model_saving.py b/tests/test_model_saving.py index d6f43d96eda..f2954fc3db8 100644 --- a/tests/test_model_saving.py +++ b/tests/test_model_saving.py @@ -9,6 +9,7 @@ from 
keras import backend as K from keras.models import Model, Sequential from keras.layers import Dense, Lambda, RepeatVector, TimeDistributed, LSTM +from keras.layers import Conv2D, Flatten from keras.layers import Input from keras import optimizers from keras import losses @@ -175,7 +176,7 @@ def test_saving_unused_layers_is_ok(): @keras_test -def test_loading_weights_by_name(): +def test_loading_weights_by_name_and_reshape(): """ test loading model weights by name on: - sequential model @@ -187,11 +188,12 @@ def test_loading_weights_by_name(): # sequential model model = Sequential() - model.add(Dense(2, input_shape=(3,), name='rick')) + model.add(Conv2D(2, (1, 1), input_shape=(1, 1, 1), name='rick')) + model.add(Flatten()) model.add(Dense(3, name='morty')) model.compile(loss=custom_loss, optimizer=custom_opt(), metrics=['acc']) - x = np.random.random((1, 3)) + x = np.random.random((1, 1, 1, 1)) y = np.random.random((1, 3)) model.train_on_batch(x, y) @@ -204,20 +206,27 @@ def test_loading_weights_by_name(): # delete and recreate model del(model) model = Sequential() - model.add(Dense(2, input_shape=(3,), name='rick')) - model.add(Dense(3, name='morty')) + model.add(Conv2D(2, (1, 1), input_shape=(1, 1, 1), name='rick')) + model.add(Conv2D(3, (1, 1), name='morty')) model.compile(loss=custom_loss, optimizer=custom_opt(), metrics=['acc']) # load weights from first model - model.load_weights(fname, by_name=True) + with pytest.raises(ValueError): + model.load_weights(fname, by_name=True, reshape=False) + with pytest.raises(ValueError): + model.load_weights(fname, by_name=False, reshape=False) + model.load_weights(fname, by_name=False, reshape=True) + model.load_weights(fname, by_name=True, reshape=True) os.remove(fname) out2 = model.predict(x) - assert_allclose(out, out2, atol=1e-05) + assert_allclose(np.squeeze(out), np.squeeze(out2), atol=1e-05) for i in range(len(model.layers)): new_weights = model.layers[i].get_weights() for j in range(len(new_weights)): - assert_allclose(old_weights[i][j], new_weights[j], atol=1e-05) + # only compare layers that have weights, skipping Flatten() + if old_weights[i]: + assert_allclose(old_weights[i][j], new_weights[j], atol=1e-05) @keras_test