3 changes: 3 additions & 0 deletions .gitignore
@@ -134,5 +134,8 @@ dmypy.json
.metals/
.vscode/

# Compiled Cython .pyx to .c (platform-dependent)
summarizer/utils/KTS/cpd_nonlin.c

# Finally untrack datasets (no money for git lfs bro)
*.h5
12 changes: 11 additions & 1 deletion summarizer/README.md
@@ -18,4 +18,14 @@ python summary2video.py -p logs/1586668539_LogisticRegressionModel/summe_splits.
## Generate splits
```
python create_split.py -d datasets/summarizer_dataset_summe_google_pool5.h5 --save-dir splits --save-name summe_splits --num-splits 5
```

## Build new HDF5 files
While we provide (in `/datasets`) existing and new HDF5 files corresponding to a few of the classic datasets in video summarization (SumMe, TVSum, Twitch-LOL), you may create your own files using `generate_dataset.py`. This allows the generation of a custom HDF5 dataset following the same template as the existing datasets (including extracted features, keyshots, labels, etc.).

This script was derived from [Shin Donghwan](https://github.com/SinDongHwan/pytorch-vsumm-reinforce/blob/master/utils/generate_dataset.py)'s own generation script, and uses [Tatsuya Shirakawa](https://github.com/TatsuyaShirakawa/KTS)'s Cython rewrite of KTS for Python 3.

For example, to rebuild [Twitch-LOL](https://github.com/chengyangfu/Pytorch-Twitch-LOL#dataset-download---google-drive) with a 2-second uniform segmentation, you may use:
```
python generate_dataset.py --video datasets/videos/EMNLP17_Twitch_LOL/final_data --annotations datasets/videos/EMNLP17_Twitch_LOL/gt --h5 datasets/summarizer_dataset_LOL_google_pool5.h5 --changepoint-method uniform --changepoint-duration 4 --keyshot-sampling 15 --extractor googlenet --layer-limit=-2
```
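
For reference, the uniform change points such a command produces can be sketched in plain NumPy. This is a hypothetical helper, not part of `generate_dataset.py`: it splits the frame range into fixed-length segments of `duration_secs * fps` frames each.

```python
import numpy as np

def uniform_change_points(n_frames, duration_secs, fps=30.0):
    """Split [0, n_frames) into fixed-length segments and return
    [start, end] frame pairs, end inclusive (hypothetical helper)."""
    seg_len = int(round(duration_secs * fps))
    starts = np.arange(0, n_frames, seg_len)
    ends = np.append(starts[1:] - 1, n_frames - 1)
    return np.stack([starts, ends], axis=1).astype(np.int32)

# A 300-frame video at 30 fps, cut into 2-second (60-frame) segments:
cps = uniform_change_points(300, duration_secs=2)
```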
292 changes: 292 additions & 0 deletions summarizer/datasets/KTS_to_uniform.ipynb
@@ -0,0 +1,292 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "KTS_to_uniform.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/sylvainma/Summarizer/blob/hdf5-dataset-generation/summarizer/datasets/KTS_to_uniform.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xk9SglnqK4AG",
"colab_type": "text"
},
"source": [
"# KTS to uniform segmentation\n",
"The following notebook edits an HDF5 dataset to change the segmentation method used. \n",
"As a result, `/change_points` and `/n_frame_per_seg` will reflect a uniform segmentation of `secs_per_segment` seconds (configurable below). \n",
"\n",
"----"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0sBcMljuMHbe",
"colab_type": "text"
},
"source": [
"Run this cell only if you are using this notebook in a standalone way, i.e. you don't already have the [Summarizer](https://github.com/sylvainma/Summarizer) code and datasets locally."
]
},
{
"cell_type": "code",
"metadata": {
"id": "hkKYZIL5HkoD",
"colab_type": "code",
"outputId": "d2d3e384-1e02-402d-8481-54724a3783c8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
}
},
"source": [
"!git clone -l -s --single-branch --branch hdf5-dataset-generation https://github.com/sylvainma/Summarizer.git summarizer\n",
"%cd summarizer\n",
"!ls"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Cloning into 'summarizer'...\n",
"warning: --local is ignored\n",
"remote: Enumerating objects: 429, done.\u001b[K\n",
"remote: Counting objects: 100% (429/429), done.\u001b[K\n",
"remote: Compressing objects: 100% (254/254), done.\u001b[K\n",
"remote: Total 879 (delta 297), reused 292 (delta 175), pack-reused 450\u001b[K\n",
"Receiving objects: 100% (879/879), 543.27 KiB | 1.67 MiB/s, done.\n",
"Resolving deltas: 100% (576/576), done.\n",
"/content/summarizer\n",
"README.md summarizer\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "l0q3Wb9QLrbA",
"colab_type": "text"
},
"source": [
"Retrieving the datasets. \n",
"If you prefer to use your own dataset, you can skip this cell and specify its filename in the parameters cell below. "
]
},
{
"cell_type": "code",
"metadata": {
"id": "rPRy9zuo3A7w",
"colab_type": "code",
"outputId": "20959795-ef53-4668-e4cb-be9cd337933d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
}
},
"source": [
"%cd summarizer/datasets\n",
"!pip install -q h5py hdf5storage numpy\n",
"!python download_datasets.py"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"/content/summarizer/summarizer/datasets\n",
"\u001b[K |████████████████████████████████| 61kB 4.2MB/s \n",
"\u001b[?25hDownloading summarizer_dataset_summe_google_pool5.h5...\n",
"Downloading summarizer_dataset_tvsum_google_pool5.h5...\n",
"Downloading summarizer_dataset_LOL_google_pool5.h5...\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SFP3XoKyx-YJ",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"#@title Parameters for uniform segmentation\n",
"#@markdown ---\n",
"#@markdown Segment length in seconds:\n",
"secs_per_segment = 2 #@param {type:\"slider\", min:1, max:100, step:1}\n",
"#@markdown ---\n",
"#@markdown Frames per second in original videos:\n",
"fps = 30.0 #@param {type:\"number\"}\n",
"#@markdown ---\n",
"#@markdown Dataset name:\n",
"dataset = 'summarizer_dataset_summe_google_pool5.h5' #@param ['summarizer_dataset_summe_google_pool5.h5', 'summarizer_dataset_tvsum_google_pool5.h5'] {allow-input: true}"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "kd6XVMpTMr0Q",
"colab_type": "text"
},
"source": [
"Opening the HDF5 dataset for editing."
]
},
{
"cell_type": "code",
"metadata": {
"id": "-5eTbWSo0dyB",
"colab_type": "code",
"colab": {}
},
"source": [
"import h5py\n",
"h5_file = h5py.File(dataset, 'r+')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "CFUpbDawM9sI",
"colab_type": "text"
},
"source": [
"We define a lambda function to retrieve the typical number of frames between two consecutive picks."
]
},
{
"cell_type": "code",
"metadata": {
"id": "1xitQ0NQM3e8",
"colab_type": "code",
"colab": {}
},
"source": [
"import numpy as np\n",
"trimmed_mean_diff = lambda x: np.mean((x - np.roll(x, 1))[1:-1])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "KXcG9BRMM0qT",
"colab_type": "text"
},
"source": [
"Iterating over the videos in the dataset."
]
},
{
"cell_type": "code",
"metadata": {
"id": "__RvNnRs0muo",
"colab_type": "code",
"colab": {}
},
"source": [
"for video in h5_file:\n",
" picks = h5_file[video]['picks'][...]\n",
" keyshot_frequency = trimmed_mean_diff(picks)\n",
" changepoint_duration = int(round(secs_per_segment * fps / keyshot_frequency))\n",
" segment_limits = picks[::changepoint_duration][:-1]\n",
" change_points = np.vstack((segment_limits, np.append(picks[::changepoint_duration][1:len(segment_limits)], [picks[-1]]))).transpose()\n",
" del h5_file[video]['change_points']\n",
" h5_file.create_dataset(f'{video}/change_points', data = change_points.astype(np.int32))\n",
"    seg_lengths = change_points[:, 1] - change_points[:, 0]\n",
"    del h5_file[video]['n_frame_per_seg']\n",
"    h5_file.create_dataset(f'{video}/n_frame_per_seg', data = seg_lengths.astype(np.int32))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZMnc1OWaNU95",
"colab_type": "text"
},
"source": [
"Closing the HDF5 dataset after editing, to release the file lock."
]
},
{
"cell_type": "code",
"metadata": {
"id": "JwUmXktzD1_g",
"colab_type": "code",
"colab": {}
},
"source": [
"h5_file.close()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "jldyRnD4KTP_",
"colab_type": "text"
},
"source": [
"Don't forget to download the newly created HDF5 dataset:"
]
},
{
"cell_type": "code",
"metadata": {
"id": "8sL0ZSgAFSan",
"colab_type": "code",
"outputId": "999ffaf6-ed5d-4cdd-87e2-4bd23abfe8f5",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"!ls -lah | grep $dataset"
],
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": [
"-rw-r--r-- 1 root root 36M Jun 13 19:45 summarizer_dataset_summe_google_pool5.h5\n"
],
"name": "stdout"
}
]
}
]
}
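
The core transformation in the notebook above — regrouping the sampled `picks` into uniform segments — can be run standalone. A minimal sketch with synthetic, evenly spaced picks (the pick spacing and video length here are illustrative, not from a real dataset):

```python
import numpy as np

# Mean frame gap between consecutive picks, trimming the first (wrap-around)
# and last differences introduced by np.roll.
trimmed_mean_diff = lambda x: np.mean((x - np.roll(x, 1))[1:-1])

secs_per_segment, fps = 2, 30.0
picks = np.arange(0, 300, 15)                      # synthetic: one pick every 15 frames

gap = trimmed_mean_diff(picks)                     # ~15 frames between picks
step = int(round(secs_per_segment * fps / gap))    # picks per segment
starts = picks[::step][:-1]
ends = np.append(picks[::step][1:len(starts)], [picks[-1]])
change_points = np.vstack((starts, ends)).T
n_frame_per_seg = change_points[:, 1] - change_points[:, 0]
```

Note that the last segment absorbs the remainder of the video, running up to the final pick rather than a full `secs_per_segment` boundary.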