Picovoice · ksyeo1010 · Nov 23, 2023 · Nov 8, 2023 · Nov 10, 2023 · Nov 20, 2023
diff --git a/.github/workflows/python-demos.yml b/.github/workflows/python-demos.yml
@@ -39,6 +39,12 @@ jobs:
     - name: Pre-build dependencies
       run: python -m pip install --upgrade pip
 
+    # ************** REMOVE AFTER RELEASE ********************
+    - name: Build binding
+      run: |
+        pip install wheel && cd ../../binding/python && python setup.py sdist bdist_wheel && pip install dist/pvleopard-2.0.0-py3-none-any.whl
+    # ********************************************************
+
     - name: Install dependencies
       run: pip install -r requirements.txt
 
@@ -55,12 +61,18 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
+    # ************** REMOVE AFTER RELEASE ********************
+    - name: Build binding
+      run: |
+        pip3 uninstall -y pvleopard && pip3 install wheel && cd ../../binding/python && python3 setup.py sdist bdist_wheel && pip3 install dist/pvleopard-2.0.0-py3-none-any.whl
+    # ********************************************************
+
     - name: Install dependencies
       run: pip3 install -r requirements.txt
 
     - name: Test
       run: python3 leopard_demo_file.py --access_key ${{secrets.PV_VALID_ACCESS_KEY}} --wav_paths ../../resources/audio_samples/test.wav
-  
+
   build-subtitle-demo:
     runs-on: ${{ matrix.os }}
     defaults:
@@ -83,6 +95,12 @@ jobs:
     - name: Pre-build dependencies
       run: python -m pip install --upgrade pip
 
+    # ************** REMOVE AFTER RELEASE ********************
+    - name: Build binding
+      run: |
+        pip install wheel && cd ../../binding/python && python setup.py sdist bdist_wheel && pip install dist/pvleopard-2.0.0-py3-none-any.whl
+    # ********************************************************
+
     - name: Install dependencies
       run: pip install -r requirements.txt
 
@@ -114,11 +132,17 @@ jobs:
     - name: Pre-build dependencies
       run: python -m pip install --upgrade pip
 
+    # ************** REMOVE AFTER RELEASE ********************
+    - name: Build binding
+      run: |
+        pip install wheel && cd ../../binding/python && python setup.py sdist bdist_wheel && pip install dist/pvleopard-2.0.0-py3-none-any.whl
+    # ********************************************************
+
     - name: Install dependencies
       run: pip install -r requirements.txt
 
     - name: Test
-      run: python main.py 
+      run: python main.py
         --access_key ${{secrets.PV_VALID_ACCESS_KEY}}
         --url https://www.youtube.com/watch?v=b7wzFEDU0U8
         --transcript_path test.txt
diff --git a/.github/workflows/python-perf.yml b/.github/workflows/python-perf.yml
@@ -39,14 +39,14 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
         include:
         - os: ubuntu-latest
-          init_performance_threshold_sec: 2.0
+          init_performance_threshold_sec: 3.5
           proc_performance_threshold_sec: 0.7
         - os: windows-latest
-          init_performance_threshold_sec: 2.5
-          proc_performance_threshold_sec: 0.9
+          init_performance_threshold_sec: 3.5
+          proc_performance_threshold_sec: 0.7
         - os: macos-latest
-          init_performance_threshold_sec: 2.4
-          proc_performance_threshold_sec: 0.9
+          init_performance_threshold_sec: 3.8
+          proc_performance_threshold_sec: 1.2
 
     steps:
     - uses: actions/checkout@v3
@@ -74,20 +74,20 @@ jobs:
         machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, jetson]
         include:
         - machine: rpi3-32
-          init_performance_threshold_sec: 7.6
-          proc_performance_threshold_sec: 3.3
+          init_performance_threshold_sec: 8.5
+          proc_performance_threshold_sec: 4.5
         - machine: rpi3-64
-          init_performance_threshold_sec: 8.4
-          proc_performance_threshold_sec: 3.3
+          init_performance_threshold_sec: 9.0
+          proc_performance_threshold_sec: 4.5
         - machine: rpi4-32
-          init_performance_threshold_sec: 5.7
-          proc_performance_threshold_sec: 2.1
+          init_performance_threshold_sec: 7.2
+          proc_performance_threshold_sec: 3.0
         - machine: rpi4-64
-          init_performance_threshold_sec: 5.1
-          proc_performance_threshold_sec: 2.0
+          init_performance_threshold_sec: 6.5
+          proc_performance_threshold_sec: 2.8
         - machine: jetson
-          init_performance_threshold_sec: 5.1
-          proc_performance_threshold_sec: 2.0
+          init_performance_threshold_sec: 6.0
+          proc_performance_threshold_sec: 2.8
 
     steps:
     - uses: actions/checkout@v3

diff --git a/binding/python/README.md b/binding/python/README.md
@@ -37,9 +37,9 @@ Create an instance of the engine and transcribe an audio file:
 ```python
 import pvleopard
 
-handle = pvleopard.create(access_key='${ACCESS_KEY}')
+leopard = pvleopard.create(access_key='${ACCESS_KEY}')
 
-transcript, words = handle.process_file('${AUDIO_PATH}')
+transcript, words = leopard.process_file('${AUDIO_PATH}')
 print(transcript)
 for word in words:
     print(
@@ -48,8 +48,12 @@ for word in words:
 ```
 
 Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and
-`${AUDIO_PATH}` to the path an audio file. Finally, when done be sure to explicitly release the resources using
-`handle.delete()`.
+`${AUDIO_PATH}` with the path to an audio file.
+
+Finally, when done be sure to explicitly release the resources:
+```python
+leopard.delete()
+```
 
 ## Language Model
 
@@ -61,7 +65,7 @@ language models with custom vocabulary and boost words in the existing vocabular
 
 Pass in the `.pv` file via the `model_path` argument:
 ```python
-handle = pvleopard.create(
+leopard = pvleopard.create(
     access_key='${ACCESS_KEY}',
     model_path='${MODEL_PATH}')
 ```

diff --git a/binding/python/_factory.py b/binding/python/_factory.py
@@ -19,7 +19,8 @@ def create(
         access_key: str,
         model_path: Optional[str] = None,
         library_path: Optional[str] = None,
-        enable_automatic_punctuation: bool = False) -> Leopard:
+        enable_automatic_punctuation: bool = False,
+        enable_diarization: bool = False) -> Leopard:
     """
     Factory method for Leopard speech-to-text engine.
 
@@ -28,6 +29,9 @@ def create(
     :param model_path: Absolute path to the file containing model parameters. If not set it will be set to the default
     location.
     :param enable_automatic_punctuation Set to `True` to enable automatic punctuation insertion.
+    :param enable_diarization: Set to `True` to enable speaker diarization, which allows Leopard to differentiate
+    speakers as part of the transcription process. Word metadata will include a `speaker_tag` to
+    identify unique speakers.
     :return: An instance of Leopard speech-to-text engine.
     """
 
@@ -41,7 +45,8 @@ def create(
         access_key=access_key,
         model_path=model_path,
         library_path=library_path,
-        enable_automatic_punctuation=enable_automatic_punctuation)
+        enable_automatic_punctuation=enable_automatic_punctuation,
+        enable_diarization=enable_diarization)
 
 
 __all__ = [

diff --git a/binding/python/_leopard.py b/binding/python/_leopard.py
@@ -18,7 +18,27 @@
 
 
 class LeopardError(Exception):
-    pass
+    """
+    Base exception for Leopard errors; carries a message and an optional list of detail messages.
+    """
+
+    def __init__(self, message: str = '', message_stack: Sequence[str] = None):
+        """
+        :param message: Short description of the failure.
+        :param message_stack: Ordered detail messages; an empty list is used when omitted.
+        """
+        super().__init__(message)
+
+        if message_stack is None:
+            message_stack = list()
+        self._message = message
+        self._message_stack = message_stack
+
+    def __str__(self):
+        # With no detail messages, the string form is just the message.
+        if len(self._message_stack) == 0:
+            return self._message
+
+        # Otherwise append one indented, zero-based numbered line per detail message.
+        details = ''.join('\n  [%d] %s' % (index, entry) for index, entry in enumerate(self._message_stack))
+        return self._message + ':' + details
+
+    @property
+    def message(self) -> str:
+        """The message passed at construction."""
+        return self._message
+
+    @property
+    def message_stack(self) -> Sequence[str]:
+        """The detail messages passed at construction (an empty list if none were given)."""
+        return self._message_stack
 
 
 class LeopardMemoryError(LeopardError):
@@ -119,21 +139,26 @@ class CWord(Structure):
             ("word", c_char_p),
             ("start_sec", c_float),
             ("end_sec", c_float),
-            ("confidence", c_float)]
+            ("confidence", c_float),
+            ("speaker_tag", c_int32)]
 
     def __init__(
             self,
             access_key: str,
             model_path: str,
             library_path: str,
-            enable_automatic_punctuation: bool = False) -> None:
+            enable_automatic_punctuation: bool = False,
+            enable_diarization: bool = False) -> None:
         """
         Constructor.
 
         :param access_key: AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)
         :param model_path: Absolute path to the file containing model parameters.
         :param library_path: Absolute path to Leopard's dynamic library.
         :param enable_automatic_punctuation Set to `True` to enable automatic punctuation insertion.
+        :param enable_diarization: Set to `True` to enable speaker diarization, which allows Leopard to differentiate
+        speakers as part of the transcription process. Word metadata will include a `speaker_tag` to
+        identify unique speakers.
         """
 
         if not isinstance(access_key, str) or len(access_key) == 0:
@@ -147,15 +172,36 @@ def __init__(
 
         library = cdll.LoadLibrary(library_path)
 
+        set_sdk_func = library.pv_set_sdk
+        set_sdk_func.argtypes = [c_char_p]
+        set_sdk_func.restype = None
+
+        set_sdk_func('python'.encode('utf-8'))
+
+        self._get_error_stack_func = library.pv_get_error_stack
+        self._get_error_stack_func.argtypes = [POINTER(POINTER(c_char_p)), POINTER(c_int)]
+        self._get_error_stack_func.restype = self.PicovoiceStatuses
+
+        self._free_error_stack_func = library.pv_free_error_stack
+        self._free_error_stack_func.argtypes = [POINTER(c_char_p)]
+        self._free_error_stack_func.restype = None
+
         init_func = library.pv_leopard_init
-        init_func.argtypes = [c_char_p, c_char_p, c_bool, POINTER(POINTER(self.CLeopard))]
+        init_func.argtypes = [c_char_p, c_char_p, c_bool, c_bool, POINTER(POINTER(self.CLeopard))]
         init_func.restype = self.PicovoiceStatuses
 
         self._handle = POINTER(self.CLeopard)()
 
-        status = init_func(access_key.encode(), model_path.encode(), enable_automatic_punctuation, byref(self._handle))
+        status = init_func(
+            access_key.encode(),
+            model_path.encode(),
+            enable_automatic_punctuation,
+            enable_diarization,
+            byref(self._handle))
         if status is not self.PicovoiceStatuses.SUCCESS:
-            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](
+                message='Initialization failed',
+                message_stack=self._get_error_stack())
 
         self._delete_func = library.pv_leopard_delete
         self._delete_func.argtypes = [POINTER(self.CLeopard)]
@@ -201,7 +247,7 @@ def __init__(
         ]
         self._words_delete_func.restype = None
 
-    Word = namedtuple('Word', ['word', 'start_sec', 'end_sec', 'confidence'])
+    Word = namedtuple('Word', ['word', 'start_sec', 'end_sec', 'confidence', 'speaker_tag'])
 
     def process(self, pcm: Sequence[int]) -> Tuple[str, Sequence[Word]]:
         """
@@ -227,7 +273,9 @@ def process(self, pcm: Sequence[int]) -> Tuple[str, Sequence[Word]]:
             byref(num_words),
             byref(c_words))
         if status is not self.PicovoiceStatuses.SUCCESS:
-            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](
+                message='Process failed',
+                message_stack=self._get_error_stack())
 
         transcript = c_transcript.value.decode('utf-8')
         self._transcript_delete_func(c_transcript)
@@ -238,7 +286,8 @@ def process(self, pcm: Sequence[int]) -> Tuple[str, Sequence[Word]]:
                 word=c_words[i].word.decode('utf-8'),
                 start_sec=c_words[i].start_sec,
                 end_sec=c_words[i].end_sec,
-                confidence=c_words[i].confidence)
+                confidence=c_words[i].confidence,
+                speaker_tag=c_words[i].speaker_tag)
             words.append(word)
 
         self._words_delete_func(c_words)
@@ -267,12 +316,9 @@ def process_file(self, audio_path: str) -> Tuple[str, Sequence[Word]]:
             byref(num_words),
             byref(c_words))
         if status is not self.PicovoiceStatuses.SUCCESS:
-            if status is self.PicovoiceStatuses.INVALID_ARGUMENT:
-                if not audio_path.lower().endswith(self._VALID_EXTENSIONS):
-                    raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](
-                        "Specified file with extension '%s' is not supported" % pathlib.Path(audio_path).suffix
-                    )
-            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](
+                message='Process file failed',
+                message_stack=self._get_error_stack())
 
         transcript = c_transcript.value.decode('utf-8')
         self._transcript_delete_func(c_transcript)
@@ -283,7 +329,8 @@ def process_file(self, audio_path: str) -> Tuple[str, Sequence[Word]]:
                 word=c_words[i].word.decode('utf-8'),
                 start_sec=c_words[i].start_sec,
                 end_sec=c_words[i].end_sec,
-                confidence=c_words[i].confidence)
+                confidence=c_words[i].confidence,
+                speaker_tag=c_words[i].speaker_tag)
             words.append(word)
 
         self._words_delete_func(c_words)
@@ -307,6 +354,21 @@ def sample_rate(self) -> int:
 
         return self._sample_rate
 
+    def _get_error_stack(self) -> Sequence[str]:
+        """
+        Retrieves the error messages currently held by the loaded Leopard library.
+
+        :return: The error messages decoded as UTF-8 strings, in the order the library reports them.
+        :raises LeopardError: The exception type mapped from the returned status if the library fails to
+        provide its error stack.
+        """
+
+        message_stack_ref = POINTER(c_char_p)()
+        message_stack_depth = c_int()
+        status = self._get_error_stack_func(byref(message_stack_ref), byref(message_stack_depth))
+        if status is not self.PicovoiceStatuses.SUCCESS:
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](message='Unable to get Leopard error state')
+
+        try:
+            # Copy every entry into a Python string before the library-owned buffer is released.
+            return [message_stack_ref[i].decode('utf-8') for i in range(message_stack_depth.value)]
+        finally:
+            # Release the stack even if decoding an entry raises, so the buffer is never leaked.
+            self._free_error_stack_func(message_stack_ref)
+
 
 __all__ = [
     'Leopard',