From 248fd0f5037c27349e45286a51dfe1e7e3d8cb4b Mon Sep 17 00:00:00 2001
From: Mateen Ulhaq <mulhaq2005@gmail.com>
Date: Tue, 8 Aug 2023 04:05:35 -0700
Subject: [PATCH 1/2] perf: read_ply replace pandas.read_csv engine=python with
 c

---
 pyntcloud/io/ply.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)
diff --git a/pyntcloud/io/ply.py b/pyntcloud/io/ply.py
index 6deda6e..b88696d 100644
--- a/pyntcloud/io/ply.py
+++ b/pyntcloud/io/ply.py
@@ -4,6 +4,10 @@
 import numpy as np
 import pandas as pd
 from collections import defaultdict
+from contextlib import contextmanager
+from io import StringIO
+from itertools import islice
+
 
 sys_byteorder = ('>', '<')[sys.byteorder == 'little']
 
@@ -132,8 +136,13 @@ def read_ply(filename, allow_bool=False):
 
         names = [x[0] for x in dtypes["vertex"]]
 
-        data["points"] = pd.read_csv(filename, sep=" ", header=None, engine="python",
-                                     skiprows=top, skipfooter=bottom, usecols=names, names=names)
+        with open(filename, 'r') as f:
+            lines = f.readlines()
+
+        with _file_from_lines(lines, top, len(lines) - bottom) as f:
+            data["points"] = pd.read_csv(
+                f, sep=" ", header=None, usecols=names, names=names
+            )
 
         for n, col in enumerate(data["points"].columns):
             data["points"][col] = data["points"][col].astype(
@@ -146,8 +155,10 @@ def read_ply(filename, allow_bool=False):
             usecols = [1, 2, 3, 5, 6, 7, 8, 9, 10] if has_texture else [1, 2, 3]
             names = names[usecols]
 
-            data["mesh"] = pd.read_csv(
-                filename, sep=" ", header=None, engine="python", skiprows=top, usecols=usecols, names=names)
+            with _file_from_lines(lines, top) as f:
+                data["mesh"] = pd.read_csv(
+                    f, sep=" ", header=None, usecols=usecols, names=names
+                )
 
             for n, col in enumerate(data["mesh"].columns):
                 data["mesh"][col] = data["mesh"][col].astype(
@@ -261,3 +272,11 @@ def describe_element(name, df):
             element.append('property ' + f + ' ' + df.columns.values[i])
 
     return element
+
+
+@contextmanager
+def _file_from_lines(lines, start=None, stop=None):
+    """Yield an in-memory text file holding ``lines[start:stop]`` (None = open end)."""
+    # Build the buffer in one shot; a StringIO created from an initial value
+    # starts at position 0, so no per-character writes or seek() are needed.
+    with StringIO("".join(islice(lines, start, stop))) as f:
+        yield f

From 12ee9f2208f4207844be80ac5fdbafaf9f0652fa Mon Sep 17 00:00:00 2001
From: Mateen Ulhaq <mulhaq2005@gmail.com>
Date: Tue, 8 Aug 2023 03:57:39 -0700
Subject: [PATCH 2/2] fix: read_off improve robustness of header parsing

Improve robustness of header parsing a bit.

In particular, ModelNet40 has faulty headers:
```bash
$ head -n 1 ModelNet40/chair/train/chair_0856.off
OFF6586 5534 0
```

For reference, the correct format is:
```
OFF
6586 5534 0
```

Nonetheless, it is still valuable to parse the faulty header.

Also, reuse the already-open file handle for reading instead of opening the file twice.
---
 pyntcloud/io/off.py | 75 +++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/pyntcloud/io/off.py b/pyntcloud/io/off.py
index 1f4193f..b816d96 100644
--- a/pyntcloud/io/off.py
+++ b/pyntcloud/io/off.py
@@ -1,43 +1,57 @@
-import pandas as pd
+import re
+
 import numpy as np
+import pandas as pd
 
 
 def read_off(filename):
-
-    with open(filename) as off:
-
-        first_line = off.readline()
+    with open(filename) as f:
+        first_line = f.readline()
         if "OFF" not in first_line:
-            raise ValueError('The file does not start with the word OFF')
-        color = True if "C" in first_line else False
+            raise ValueError("The file does not start with the word OFF")
+        has_color = "C" in first_line
+
+        num_rows = None
+        n_points = None
+        n_faces = None
+        n_header = 1
 
-        n_points = 0
-        n_faces = 0
+        # Backtrack to account for faulty headers, e.g. "OFF4 4 0".
+        m = re.match(r"^(?P<prefix>\D+)([\d\s]+)$", first_line)
+        if m:
+            f.seek(len(m.group("prefix")))
+            n_header = 0
 
-        count = 1
-        for line in off:
-            count += 1
+        # Read header.
+        for line in f:
+            n_header += 1
             if line.startswith("#"):
                 continue
             line = line.strip().split()
-            if len(line) > 1:
-                n_points = int(line[0])
-                n_faces = int(line[1])
-                break
+            if len(line) <= 1:
+                continue
+            n_points = int(line[0])
+            n_faces = int(line[1])
+            num_rows = n_points + n_faces
+            break
+
+        if num_rows is None:
+            raise ValueError("The file does not contain a valid header")
 
-        if (n_points == 0):
-            raise ValueError('The file has no points')
+        if n_points == 0:
+            raise ValueError("The file contains no points")
 
         data = {}
         point_names = ["x", "y", "z"]
-        point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32}
+        point_types = {"x": np.float32, "y": np.float32, "z": np.float32}
 
-        if color:
+        if has_color:
             point_names.extend(["red", "green", "blue"])
-            point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8})
+            color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8}
+            point_types = {**point_types, **color_point_types}
 
         data["points"] = pd.read_csv(
-            off,
+            f,
             sep=" ",
             header=None,
             engine="c",
@@ -45,18 +59,25 @@ def read_off(filename):
             names=point_names,
             dtype=point_types,
             index_col=False,
-            comment="#"
+            comment="#",
         )
 
+        assert len(data["points"]) == n_points
+
+        f.seek(0)
+
         data["mesh"] = pd.read_csv(
-            filename,
+            f,
             sep=" ",
             header=None,
             engine="c",
-            skiprows=(count + n_points),
+            skiprows=n_header + n_points,
             nrows=n_faces,
             usecols=[1, 2, 3],
             names=["v1", "v2", "v3"],
-            comment="#"
+            comment="#",
         )
-        return data
+
+        assert len(data["mesh"]) == n_faces
+
+    return data