mosaicml · KuuCi · Sep 27, 2024 · Sep 26, 2024 · Sep 27, 2024 · Sep 27, 2024
@@ -234,27 +234,7 @@ def run_query(
     elif method == 'dbconnect':
         if spark == None:
             raise ValueError(f'sparkSession is required for dbconnect')
-
-        try:
-            df = spark.sql(query)
-        except Exception as e:
-            from pyspark.errors import AnalysisException
-            if isinstance(e, AnalysisException):
-                if 'INSUFFICIENT_PERMISSIONS' in e.message:  # pyright: ignore
-                    match = re.search(
-                        r"Schema\s+'([^']+)'",
-                        e.message,  # pyright: ignore
-                    )
-                    if match:
-                        schema_name = match.group(1)
-                        action = f'using the schema {schema_name}'
-                    else:
-                        action = 'using the schema'
-                    raise InsufficientPermissionsError(action=action,) from e
-            raise RuntimeError(
-                f'Error in querying into schema. Restart sparkSession and try again',
-            ) from e
-
+        df = spark.sql(query)
         if collect:
             return df.collect()
         return df
@@ -469,71 +449,66 @@ def fetch(
     """
     cursor = dbsql.cursor() if dbsql is not None else None
     try:
-        nrows = get_total_rows(
-            tablename,
-            method,
-            cursor,
-            sparkSession,
-        )
-    except Exception as e:
-        from pyspark.errors import AnalysisException
-        if isinstance(e, AnalysisException):
-            if 'INSUFFICIENT_PERMISSIONS' in e.message:  # pyright: ignore
-                raise InsufficientPermissionsError(
-                    action=f'reading from {tablename}',
-                ) from e
-        if isinstance(e, InsufficientPermissionsError):
-            raise e
-        raise RuntimeError(
-            f'Error in get rows from {tablename}. Restart sparkSession and try again',
-        ) from e
+        # Get total rows
+        nrows = get_total_rows(tablename, method, cursor, sparkSession)
 
-    try:
+        # Get columns info
         columns, order_by, columns_str = get_columns_info(
             tablename,
             method,
             cursor,
             sparkSession,
         )
+
+        if method == 'dbconnect' and sparkSession is not None:
+            log.info(f'{processes=}')
+            df = sparkSession.table(tablename)
+
+            # Running the query and collecting the data as arrow or json.
+            signed, _, _ = df.collect_cf('arrow')  # pyright: ignore
+            log.info(f'len(signed) = {len(signed)}')
+
+            args = get_args(signed, json_output_folder, columns)
+
+            # Stopping the SparkSession to avoid spilling connection state into the subprocesses.
+            sparkSession.stop()
+
+            with ProcessPoolExecutor(max_workers=processes) as executor:
+                list(executor.map(download_starargs, args))
+
+        elif method == 'dbsql' and cursor is not None:
+            for start in range(0, nrows, batch_size):
+                log.warning(f'batch {start}')
+                end = min(start + batch_size, nrows)
+                fetch_data(
+                    method,
+                    cursor,
+                    sparkSession,
+                    start,
+                    end,
+                    order_by,
+                    tablename,
+                    columns_str,
+                    json_output_folder,
+                )
+
     except Exception as e:
-        raise RuntimeError(
-            f'Error in get columns from {tablename}. Restart sparkSession and try again',
-        ) from e
+        from databricks.sql.exc import ServerOperationError
+        from pyspark.errors import AnalysisException
 
-    if method == 'dbconnect' and sparkSession is not None:
-        log.info(f'{processes=}')
-        df = sparkSession.table(tablename)
-
-        # Running the query and collecting the data as arrow or json.
-        signed, _, _ = df.collect_cf('arrow')  # pyright: ignore
-        log.info(f'len(signed) = {len(signed)}')
-
-        args = get_args(signed, json_output_folder, columns)
-
-        # Stopping the SparkSession to avoid spilling connection state into the subprocesses.
-        sparkSession.stop()
-
-        with ProcessPoolExecutor(max_workers=processes) as executor:
-            list(executor.map(download_starargs, args))
-
-    elif method == 'dbsql' and cursor is not None:
-        for start in range(0, nrows, batch_size):
-            log.warning(f'batch {start}')
-            end = min(start + batch_size, nrows)
-            fetch_data(
-                method,
-                cursor,
-                sparkSession,
-                start,
-                end,
-                order_by,
-                tablename,
-                columns_str,
-                json_output_folder,
-            )
+        if isinstance(e, (AnalysisException, ServerOperationError)):
+            if 'INSUFFICIENT_PERMISSIONS' in str(e):
+                raise InsufficientPermissionsError(str(e)) from e
+
+        if isinstance(e, InsufficientPermissionsError):
+            raise
+
+        # For any other exception, raise a general error
+        raise RuntimeError(f'Error processing {tablename}: {str(e)}') from e
 
-    if cursor is not None:
-        cursor.close()
+    finally:
+        if cursor is not None:
+            cursor.close()
 
 
 def validate_and_get_cluster_info(

@@ -456,6 +456,5 @@ def __init__(
 class InsufficientPermissionsError(UserError):
     """Error thrown when the user does not have sufficient permissions."""
 
-    def __init__(self, action: str) -> None:
-        message = f'Insufficient permissions when {action}. Please check your permissions.'
-        super().__init__(message, action=action)
+    def __init__(self, message: str) -> None:
+        super().__init__(message)