PaddlePaddle · nickyfantasy · Mar 29, 2018 · Mar 28, 2018 · Mar 28, 2018 · Mar 28, 2018
diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
@@ -79,9 +79,14 @@ PYBIND11_MODULE(core, m) {
              auto tablet = self.tablet(tag);
              return vs::components::ImageReader(self.mode(), tablet);
            })
-      .def("get_text", [](vs::LogReader& self, const std::string& tag) {
+      .def("get_text",
+           [](vs::LogReader& self, const std::string& tag) {
+             auto tablet = self.tablet(tag);
+             return vs::components::TextReader(tablet);
+           })
+      .def("get_audio", [](vs::LogReader& self, const std::string& tag) {
         auto tablet = self.tablet(tag);
-        return vs::components::TextReader(tablet);
+        return vs::components::AudioReader(self.mode(), tablet);
       });
 
   // clang-format on
@@ -119,10 +124,19 @@ PYBIND11_MODULE(core, m) {
              auto tablet = self.AddTablet(tag);
              return vs::components::Image(tablet, num_samples, step_cycle);
            })
-      .def("new_text", [](vs::LogWriter& self, const std::string& tag) {
-        auto tablet = self.AddTablet(tag);
-        return vs::components::Text(tablet);
-      });
+      .def("new_text",
+           [](vs::LogWriter& self, const std::string& tag) {
+             auto tablet = self.AddTablet(tag);
+             return vs::components::Text(tablet);
+           })
+      .def("new_audio",
+           [](vs::LogWriter& self,
+              const std::string& tag,
+              int num_samples,
+              int step_cycle) {
+             auto tablet = self.AddTablet(tag);
+             return vs::components::Audio(tablet, num_samples, step_cycle);
+           });
 
 //------------------- components --------------------
 #define ADD_SCALAR_READER(T)                               \
@@ -219,6 +233,61 @@ PYBIND11_MODULE(core, m) {
       .def("total_records", &cp::TextReader::total_records)
       .def("size", &cp::TextReader::size);
 
+  py::class_<cp::Audio>(m, "AudioWriter", R"pbdoc(
+            PyBind class. Must instantiate through the LogWriter.
+          )pbdoc")
+      .def("set_caption", &cp::Audio::SetCaption, R"pbdoc(
+            Set the caption of this audio component, the tag is used by default.
+          )pbdoc")
+      .def("start_sampling", &cp::Audio::StartSampling, R"pbdoc(
+            Start a sampling period, this interface will start a new reservoir sampling phase.
+          )pbdoc")
+      .def("is_sample_taken", &cp::Audio::IsSampleTaken, R"pbdoc(
+            Will this sample be taken, this interface is introduced to reduce the cost
+            of copy audio data, by testing whether this audio will be sampled, and only
+            copy data when it should be sampled. In that way, most of un-sampled audio
+            data need not be copied or processed at all.
+
+            :return: Index to pass to set_sample, or -1 if the sample is not taken
+            :rtype: integer
+                  )pbdoc")
+      .def("finish_sampling", &cp::Audio::FinishSampling, R"pbdoc(
+            End a sampling period, it will clear all states for reservoir sampling.
+          )pbdoc")
+      .def("set_sample", &cp::Audio::SetSample, R"pbdoc(
+            Store the flatten audio data with sample rate specified.
+
+            :param index: Index returned by is_sample_taken
+            :type index: integer
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc")
+      .def("add_sample", &cp::Audio::AddSample, R"pbdoc(
+            A combined interface for is_sample_taken and set_sample, simpler but is less efficient.
+
+            :param sample_rate: Sample rate of audio
+            :type sample_rate: integer
+            :param audio_data: Flatten audio data
+            :type audio_data: list
+                  )pbdoc");
+
+  py::class_<cp::AudioReader::AudioRecord>(m, "AudioRecord")
+      // Each accessor returns a copy. TODO(ChunweiYan) make these copyless.
+      .def("data", [](cp::AudioReader::AudioRecord& rec) { return rec.data; })
+      .def("sample_rate",
+           [](cp::AudioReader::AudioRecord& rec) { return rec.sample_rate; })
+      .def("step_id",
+           [](cp::AudioReader::AudioRecord& rec) { return rec.step_id; });
+  // Reader side of the audio component, exposing cp::AudioReader's methods.
+  py::class_<cp::AudioReader>(m, "AudioReader")
+      .def("caption", &cp::AudioReader::caption)
+      .def("num_records", &cp::AudioReader::num_records)
+      .def("num_samples", &cp::AudioReader::num_samples)
+      .def("record", &cp::AudioReader::record)
+      .def("timestamp", &cp::AudioReader::timestamp);
+
 #define ADD_HISTOGRAM_WRITER(T)                                          \
   py::class_<cp::Histogram<T>>(m, "HistogramWriter__" #T, \ 
    R"pbdoc(PyBind class. Must instantiate through the LogWriter.)pbdoc") \

diff --git a/visualdl/logic/sdk.cc b/visualdl/logic/sdk.cc
@@ -222,11 +222,6 @@ void Image::SetSample(int index,
   CHECK_LT(index, num_samples_);
   CHECK_LE(index, num_records_);
 
-  // trick to store int8 to protobuf
-  std::vector<byte_t> data_str(data.size());
-  for (int i = 0; i < data.size(); i++) {
-    data_str[i] = data[i];
-  }
   Uint8Image image(new_shape[2], new_shape[0] * new_shape[1]);
   NormalizeImage(&image, &data[0], new_shape[0] * new_shape[1], new_shape[2]);
 
@@ -352,6 +347,111 @@ std::string TextReader::caption() const {
 
 size_t TextReader::size() const { return reader_.total_records(); }
 
+void Audio::StartSampling() {
+  // Nothing is recorded for steps that fall outside the sampling cycle.
+  if (!ToSampleThisStep()) {
+    return;
+  }
+
+  // Open a record for this step, tagged with its id and the current time.
+  step_ = writer_.AddRecord();
+  step_.SetId(step_id_);
+  time_t now = std::time(nullptr);
+  step_.SetTimeStamp(now);
+  // Reserve one data slot per sample, then restart the candidate counter.
+  for (int slot = 0; slot < num_samples_; ++slot) step_.AddData();
+  num_records_ = 0;
+}
+
+int Audio::IsSampleTaken() {
+  if (!ToSampleThisStep()) return -1;
+
+  // Reservoir sampling: the first num_samples_ candidates fill the slots in
+  // order; each later one replaces a random slot with probability
+  // num_samples_ / num_records_. A return of -1 means "not taken".
+  ++num_records_;
+  if (num_records_ <= num_samples_) return num_records_ - 1;
+
+  const float keep_prob = static_cast<float>(num_samples_) / num_records_;
+  const float draw = static_cast<float>(rand()) / RAND_MAX;
+  if (draw >= keep_prob) return -1;
+
+  return rand() % num_samples_;  // slot that the new sample overwrites
+}
+
+void Audio::FinishSampling() {
+  // NOTE(review): the cycle test below runs after the increment, i.e. against
+  // the NEXT step id rather than the step just finished - confirm intended.
+  ++step_id_;
+  if (ToSampleThisStep()) writer_.parent()->PersistToDisk();
+}
+
+void Audio::AddSample(int sample_rate, const std::vector<value_t>& data) {
+  // Ask the sampler for a slot; a negative answer means "skip this sample".
+  const auto slot = IsSampleTaken();
+  if (slot < 0) return;
+  SetSample(slot, sample_rate, data);
+}
+
+// Stores `data` in slot `index`; sample_rate is only validated, not saved.
+void Audio::SetSample(int index,
+                      int sample_rate,
+                      const std::vector<value_t>& data) {
+  CHECK_GT(sample_rate, 0)
+      << "sample rate should be something like 6000, 8000 or 44100";
+  CHECK_LT(index, num_samples_)
+      << "index should be less than number of samples";
+  CHECK_LE(index, num_records_)
+      << "index should be less than or equal to number of records";
+
+  // Narrow each float to one char, then build the payload from the whole
+  // [begin, end) range: constructing it from a bare char* would cut the data
+  // at the first zero byte and overrun the buffer when none is present.
+  std::vector<char> data_str(data.size());
+  for (size_t i = 0; i < data.size(); ++i) data_str[i] = data[i];
+  BinaryRecord brcd(GenBinaryRecordDir(step_.parent()->dir()),
+                    std::string(data_str.begin(), data_str.end()));
+  brcd.tofile();
+
+  auto entry = step_.MutableData<std::vector<byte_t>>(index);
+  // A slot being overwritten still names its old file: delete that file.
+  auto old_hash = entry.reader().GetRaw();
+  if (!old_hash.empty()) {
+    std::string old_path =
+        GenBinaryRecordDir(step_.parent()->dir()) + "/" + old_hash;
+    CHECK_EQ(std::remove(old_path.c_str()), 0) << "delete old binary record "
+                                               << old_path << " failed";
+  }
+  entry.SetRaw(brcd.filename());
+}
+
+std::string AudioReader::caption() {
+  CHECK_EQ(reader_.captions().size(), 1);  // exactly one caption expected
+  auto text = reader_.captions().front();
+  if (!LogReader::TagMatchMode(text, mode_)) {
+    string::TagDecode(text);  // presumably decodes in place - TODO confirm
+    return text;
+  }
+  return LogReader::GenReadableTag(mode_, text);
+}
+
+AudioReader::AudioRecord AudioReader::record(int offset, int index) {
+  // Zero-initialize: sample_rate is never filled in below, so a plain
+  // `AudioRecord res;` would hand an indeterminate int to the caller.
+  AudioRecord res{};
+  auto record = reader_.record(offset);
+  auto entry = record.data(index);
+  auto filename = entry.GetRaw();
+  CHECK(!g_log_dir.empty())
+      << "g_log_dir should be set in LogReader construction";
+  BinaryRecordReader brcd(GenBinaryRecordDir(g_log_dir), filename);
+
+  // Widen every byte read from the binary record to int.
+  for (byte_t b : brcd.data) res.data.push_back(static_cast<int>(b));
+  res.step_id = record.id();
+  return res;
+}
+
 }  // namespace components
 
 }  // namespace visualdl
diff --git a/visualdl/logic/sdk.h b/visualdl/logic/sdk.h
@@ -171,7 +171,7 @@ struct Image {
 
   /*
    * A combined interface for IsSampleTaken and SetSample, simpler but might be
-   * low effience.
+   * low efficiency.
    */
   void AddSample(const std::vector<shape_t>& shape,
                  const std::vector<value_t>& data);
@@ -326,6 +326,114 @@ struct TextReader {
   TabletReader reader_;
 };
 
+/*
+ * Audio component writer.
+ */
+struct Audio {
+  using value_t = float;
+
+  /*
+   * step_cycle: only every `step_cycle`-th step is stored as a record.
+   * num_samples: how many samples to take in a step.
+   */
+  Audio(Tablet tablet, int num_samples, int step_cycle)
+      : writer_(tablet), num_samples_(num_samples), step_cycle_(step_cycle) {
+    CHECK_GT(step_cycle, 0);
+    CHECK_GT(num_samples, 0);
+
+    writer_.SetType(Tablet::Type::kAudio);
+    // record the sample count, then use the audio's tag as default caption.
+    writer_.SetNumSamples(num_samples);
+    SetCaption(tablet.reader().tag());
+  }
+
+  void SetCaption(const std::string& c) {
+    writer_.SetCaptions(std::vector<std::string>({c}));
+  }
+
+  /*
+   * Start a sampling period, this interface will start a new reservoir
+   * sampling phase. Does nothing on steps outside the step cycle.
+   */
+  void StartSampling();
+  /*
+   * End a sampling period: advances the step id and may persist to disk.
+   */
+  void FinishSampling();
+
+  /*
+   * A combined interface for IsSampleTaken and SetSample, simpler but
+   * possibly less efficient.
+   */
+  void AddSample(int sample_rate, const std::vector<value_t>& data);
+
+  /*
+   * Tests whether the next audio would be sampled, so that callers can skip
+   * copying or processing audio that is not kept. Returns the index to pass
+   * to SetSample, or -1 when the sample is not taken. Each call counts as
+   * one candidate on a sampled step.
+   */
+  int IsSampleTaken();
+  /*
+   * Store audio data into sample slot `index`; sample_rate must be > 0.
+   */
+  void SetSample(int index, int sample_rate, const std::vector<value_t>& data);
+
+protected:
+  bool ToSampleThisStep() { return step_id_ % step_cycle_ == 0; }
+
+private:
+  Tablet writer_;
+  Record step_;
+  int num_records_{0};  // candidates seen so far in the current step
+  int num_samples_{0};  // sample slots per stored step
+  int step_id_{0};      // incremented by FinishSampling
+  int step_cycle_;
+};
+
+/*
+ * Audio component reader.
+ */
+struct AudioReader {
+  using value_t = typename Audio::value_t;
+
+  struct AudioRecord {
+    int step_id{0};      // id of the record the sample was read from
+    int sample_rate{0};  // stays 0 unless whoever builds the record sets it
+    std::vector<int> data;
+  };
+
+  AudioReader(const std::string& mode, TabletReader tablet)
+      : reader_(tablet), mode_{mode} {}
+
+  std::string caption();
+
+  // number of steps.
+  int num_records() { return reader_.total_records(); }
+
+  int num_samples() { return reader_.num_samples(); }
+
+  int64_t timestamp(int step) { return reader_.record(step).timestamp(); }
+
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   */
+  AudioRecord record(int offset, int index);
+
+  /*
+   * offset: offset of a step.
+   * index: index of a sample.
+   */
+  std::vector<value_t> data(int offset, int index);
+
+  int stepid(int offset, int index);
+
+private:
+  TabletReader reader_;
+  std::string mode_;
+};
+
 }  // namespace components
 }  // namespace visualdl
 

diff --git a/visualdl/python/storage.py b/visualdl/python/storage.py
@@ -119,6 +119,16 @@ def text(self, tag):
         check_tag_name_valid(tag)
         return self.reader.get_text(tag)
 
+    def audio(self, tag):
+        """
+        Get an audio reader with tag
+
+        :param tag:  The reader will read the audio data marked with tag
+        :type tag: basestring
+        """
+        check_tag_name_valid(tag)
+        return self.reader.get_audio(tag)
+
     def __enter__(self):
         return self
 
@@ -226,6 +236,22 @@ def histogram(self, tag, num_buckets, type='float'):
         }
         return types[type](tag, num_buckets)
 
+    def audio(self, tag, num_samples, step_cycle=1):
+        """
+        Create an audio writer that used to write audio data.
+
+        :param tag: The audio writer will label the audio with tag
+        :type tag: basestring
+        :param num_samples: how many samples to take in a step.
+        :type num_samples: integer
+        :param step_cycle: store every `step_cycle` as a record.
+        :type step_cycle: integer
+        :return: A audio writer to sample audio
+        :rtype: AudioWriter
+        """
+        check_tag_name_valid(tag)
+        return self.writer.new_audio(tag, num_samples, step_cycle)
+
     def text(self, tag):
         check_tag_name_valid(tag)
         return self.writer.new_text(tag)

diff --git a/visualdl/storage/storage.proto b/visualdl/storage/storage.proto
@@ -108,6 +108,7 @@ message Tablet {
     kHistogram = 1;
     kImage = 2;
     kText = 3;
+    kAudio = 4;
   }
   // The unique identification for this `Tablet`. VisualDL will have no the
   // concept of FileWriter like TB. It will store all the tablets in a single