From 248fd0f5037c27349e45286a51dfe1e7e3d8cb4b Mon Sep 17 00:00:00 2001 From: Mateen Ulhaq Date: Tue, 8 Aug 2023 04:05:35 -0700 Subject: [PATCH 1/2] perf: read_ply replace pandas.read_csv engine=python with c --- pyntcloud/io/ply.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/pyntcloud/io/ply.py b/pyntcloud/io/ply.py index 6deda6e..b88696d 100644 --- a/pyntcloud/io/ply.py +++ b/pyntcloud/io/ply.py @@ -4,6 +4,10 @@ import numpy as np import pandas as pd from collections import defaultdict +from contextlib import contextmanager +from io import StringIO +from itertools import islice + sys_byteorder = ('>', '<')[sys.byteorder == 'little'] @@ -132,8 +136,13 @@ def read_ply(filename, allow_bool=False): names = [x[0] for x in dtypes["vertex"]] - data["points"] = pd.read_csv(filename, sep=" ", header=None, engine="python", - skiprows=top, skipfooter=bottom, usecols=names, names=names) + with open(filename, 'r') as f: + lines = f.readlines() + + with _file_from_lines(lines, top, len(lines) - bottom) as f: + data["points"] = pd.read_csv( + f, sep=" ", header=None, usecols=names, names=names + ) for n, col in enumerate(data["points"].columns): data["points"][col] = data["points"][col].astype( @@ -146,8 +155,10 @@ def read_ply(filename, allow_bool=False): usecols = [1, 2, 3, 5, 6, 7, 8, 9, 10] if has_texture else [1, 2, 3] names = names[usecols] - data["mesh"] = pd.read_csv( - filename, sep=" ", header=None, engine="python", skiprows=top, usecols=usecols, names=names) + with _file_from_lines(lines, top) as f: + data["mesh"] = pd.read_csv( + f, sep=" ", header=None, usecols=usecols, names=names + ) for n, col in enumerate(data["mesh"].columns): data["mesh"][col] = data["mesh"][col].astype( @@ -261,3 +272,11 @@ def describe_element(name, df): element.append('property ' + f + ' ' + df.columns.values[i]) return element + + +@contextmanager +def _file_from_lines(lines, start=None, stop=None): + with StringIO() as f: + 
f.writelines("".join(islice(lines, start, stop))) + f.seek(0) + yield f From 12ee9f2208f4207844be80ac5fdbafaf9f0652fa Mon Sep 17 00:00:00 2001 From: Mateen Ulhaq Date: Tue, 8 Aug 2023 03:57:39 -0700 Subject: [PATCH 2/2] fix: read_off improve robustness of header parsing Improve robustness of header parsing a bit. In particular, ModelNet40 has faulty headers: ```bash $ head -n 1 ModelNet40/chair/train/chair_0856.off OFF6586 5534 0 ``` For reference, the correct format is: ``` OFF 6586 5534 0 ``` Nonetheless, it is still valuable to parse the faulty header. Also, reuse already open file for reading instead of opening it twice. --- pyntcloud/io/off.py | 75 +++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/pyntcloud/io/off.py b/pyntcloud/io/off.py index 1f4193f..b816d96 100644 --- a/pyntcloud/io/off.py +++ b/pyntcloud/io/off.py @@ -1,43 +1,57 @@ -import pandas as pd +import re + import numpy as np +import pandas as pd def read_off(filename): - - with open(filename) as off: - - first_line = off.readline() + with open(filename) as f: + first_line = f.readline() if "OFF" not in first_line: - raise ValueError('The file does not start with the word OFF') - color = True if "C" in first_line else False + raise ValueError("The file does not start with the word OFF") + has_color = "C" in first_line + + num_rows = None + n_points = None + n_faces = None + n_header = 1 - n_points = 0 - n_faces = 0 + # Backtrack to account for faulty headers, e.g. "OFF4 4 0". + m = re.match(r"^(?P<prefix>\D+)([\d\s]+)$", first_line) + if m: + f.seek(len(m.group("prefix"))) + n_header = 0 - count = 1 - for line in off: - count += 1 + # Read header. 
+ for line in f: + n_header += 1 if line.startswith("#"): continue line = line.strip().split() - if len(line) > 1: - n_points = int(line[0]) - n_faces = int(line[1]) - break + if len(line) <= 1: + continue + n_points = int(line[0]) + n_faces = int(line[1]) + num_rows = n_points + n_faces + break + + if num_rows is None: + raise ValueError("The file does not contain a valid header") - if (n_points == 0): - raise ValueError('The file has no points') + if n_points == 0: + raise ValueError("The file contains no points") data = {} point_names = ["x", "y", "z"] - point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32} + point_types = {"x": np.float32, "y": np.float32, "z": np.float32} - if color: + if has_color: point_names.extend(["red", "green", "blue"]) - point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8}) + color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8} + point_types = {**point_types, **color_point_types} data["points"] = pd.read_csv( - off, + f, sep=" ", header=None, engine="c", @@ -45,18 +59,25 @@ def read_off(filename): names=point_names, dtype=point_types, index_col=False, - comment="#" + comment="#", ) + assert len(data["points"]) == n_points + + f.seek(0) + data["mesh"] = pd.read_csv( - filename, + f, sep=" ", header=None, engine="c", - skiprows=(count + n_points), + skiprows=n_header + n_points, nrows=n_faces, usecols=[1, 2, 3], names=["v1", "v2", "v3"], - comment="#" + comment="#", ) - return data + + assert len(data["mesh"]) == n_faces + + return data