From 4794d529685e138941ad26cf849249049701b326 Mon Sep 17 00:00:00 2001 From: jackieff Date: Fri, 4 Dec 2020 20:36:51 -0500 Subject: [PATCH 1/5] Speeding up lookup of inp sections and bracketed words --- swmmio/utils/text.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/swmmio/utils/text.py b/swmmio/utils/text.py index a04913b..3442f81 100644 --- a/swmmio/utils/text.py +++ b/swmmio/utils/text.py @@ -216,23 +216,22 @@ def get_inp_sections_details(inp_path, include_brackets=False): found_sects = OrderedDict() with open(inp_path) as f: - for line in f: - sect_not_found = True - for sect_id, data in INP_OBJECTS.items(): - # find the start of an INP section - search_tag = format_inp_section_header(sect_id) - if search_tag.lower() in line.lower(): - if include_brackets: - sect_id = '[{}]'.format(sect_id.upper()) - found_sects[sect_id.upper()] = data - sect_not_found = False - break - if sect_not_found: - if '[' and ']' in line: - h = line.strip() - if not include_brackets: - h = h.replace('[', '').replace(']', '') - found_sects[h] = OrderedDict(columns=['blob']) + txt = f.read() + section_dict = {key:txt.find("[{}]".format(key)) for key in INP_OBJECTS.keys() if txt.find("[{}]".format(key)) >= 0} + section_dict = sorted(section_dict, key=section_dict.get) + bracketed_words = re.findall(r"\[([A-Za-z0-9_]+)\]",txt) + + for sect in bracketed_words: + if sect not in section_dict: + if not include_brackets: + h = sect.replace('[', '').replace(']', '') + found_sects[h] = OrderedDict(columns=['blob']) + else: + if include_brackets: + sect_id = '[{}]'.format(sect.upper()) + else: + sect_id = sect.upper() + found_sects[sect_id] = INP_OBJECTS[sect] # make necessary adjustments to columns that change based on options ops_cols = INP_OBJECTS['OPTIONS']['columns'] From d556e1fdb2e2cb20a30ac9f814acf8206ff1fd88 Mon Sep 17 00:00:00 2001 From: jackieff Date: Wed, 5 May 2021 17:12:44 -0400 Subject: [PATCH 2/5] Eliminating need to scan inp file twice for dataframe_from_inp --- swmmio/utils/dataframes.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/swmmio/utils/dataframes.py b/swmmio/utils/dataframes.py index 612f2f9..948b735 100644 --- a/swmmio/utils/dataframes.py +++ b/swmmio/utils/dataframes.py @@ -116,22 +116,18 @@ def dataframe_from_inp(inp_path, section, additional_cols=None, quote_replace=' :param quote_replace: :return: """ - + from swmmio.defs import INP_OBJECTS # format the section header for look up in headers OrderedDict sect = remove_braces(section).upper() - # get list of all section headers in inp to use as section ending flags - headers = get_inp_sections_details(inp_path, include_brackets=False) - - if sect not in headers: - warnings.warn(f'{sect} section not found in {inp_path}') - return pd.DataFrame() - # extract the string and read into a dataframe start_string = format_inp_section_header(section) - end_strings = [format_inp_section_header(h) for h in headers.keys()] + end_strings = [format_inp_section_header(h) for h in INP_OBJECTS.keys()] s = extract_section_of_file(inp_path, start_string, end_strings, **kwargs) + if len(s.replace(start_string, "").replace("\n","")) == 0: + warnings.warn(f'{sect} section not found in {inp_path}') + return pd.DataFrame() # replace occurrences of double quotes "" s = s.replace('""', quote_replace) From 339a29daf7bb3f86f495a85a9820533e6145b528 Mon Sep 17 00:00:00 2001 From: jackieff Date: Wed, 5 May 2021 17:21:33 -0400 Subject: [PATCH 3/5] Fixing headers error --- swmmio/utils/dataframes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swmmio/utils/dataframes.py b/swmmio/utils/dataframes.py index 948b735..ae5c512 100644 --- a/swmmio/utils/dataframes.py +++ b/swmmio/utils/dataframes.py @@ -134,9 +134,9 @@ def dataframe_from_inp(inp_path, section, additional_cols=None, quote_replace=' # and get the list of columns to use for parsing this section # add any additional columns needed for special cases (build instructions) additional_cols = [] if additional_cols is None else additional_cols - cols = headers[sect]['columns'] + additional_cols + cols = INP_OBJECTS[sect]['columns'] + additional_cols - if headers[sect]['columns'][0] == 'blob': + if INP_OBJECTS[sect]['columns'][0] == 'blob': # return the whole row, without specific col headers return pd.read_csv(StringIO(s), delim_whitespace=False) else: From 1292b22805e09a0e81f49ce381e47b150c5c828a Mon Sep 17 00:00:00 2001 From: Adam Erispaha Date: Wed, 5 May 2021 16:25:10 -0500 Subject: [PATCH 4/5] minor change to code style --- swmmio/utils/text.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/swmmio/utils/text.py b/swmmio/utils/text.py index 3442f81..22cd16e 100644 --- a/swmmio/utils/text.py +++ b/swmmio/utils/text.py @@ -217,9 +217,12 @@ def get_inp_sections_details(inp_path, include_brackets=False): with open(inp_path) as f: txt = f.read() - section_dict = {key:txt.find("[{}]".format(key)) for key in INP_OBJECTS.keys() if txt.find("[{}]".format(key)) >= 0} + section_dict = { + key: txt.find("[{}]".format(key)) for key in INP_OBJECTS.keys() + if txt.find("[{}]".format(key)) >= 0 + } section_dict = sorted(section_dict, key=section_dict.get) - bracketed_words = re.findall(r"\[([A-Za-z0-9_]+)\]",txt) + bracketed_words = re.findall(r"\[([A-Za-z0-9_]+)\]", txt) for sect in bracketed_words: if sect not in section_dict: From 3331597ecc1cf916adc990479261b0de2bd8c63d Mon Sep 17 00:00:00 2001 From: jackieff Date: Wed, 5 May 2021 17:49:03 -0400 Subject: [PATCH 5/5] Reverting to 31766d6 --- swmmio/utils/dataframes.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/swmmio/utils/dataframes.py b/swmmio/utils/dataframes.py index ae5c512..612f2f9 100644 --- a/swmmio/utils/dataframes.py +++ b/swmmio/utils/dataframes.py @@ -116,27 +116,31 @@ def dataframe_from_inp(inp_path, section, additional_cols=None, quote_replace=' :param quote_replace: :return: """ - from swmmio.defs import INP_OBJECTS + # format the section header for look up in headers OrderedDict sect = remove_braces(section).upper() + # get list of all section headers in inp to use as section ending flags + headers = get_inp_sections_details(inp_path, include_brackets=False) + + if sect not in headers: + warnings.warn(f'{sect} section not found in {inp_path}') + return pd.DataFrame() + # extract the string and read into a dataframe start_string = format_inp_section_header(section) - end_strings = [format_inp_section_header(h) for h in INP_OBJECTS.keys()] + end_strings = [format_inp_section_header(h) for h in headers.keys()] s = extract_section_of_file(inp_path, start_string, end_strings, **kwargs) - if len(s.replace(start_string, "").replace("\n","")) == 0: - warnings.warn(f'{sect} section not found in {inp_path}') - return pd.DataFrame() # replace occurrences of double quotes "" s = s.replace('""', quote_replace) # and get the list of columns to use for parsing this section # add any additional columns needed for special cases (build instructions) additional_cols = [] if additional_cols is None else additional_cols - cols = INP_OBJECTS[sect]['columns'] + additional_cols + cols = headers[sect]['columns'] + additional_cols - if INP_OBJECTS[sect]['columns'][0] == 'blob': + if headers[sect]['columns'][0] == 'blob': # return the whole row, without specific col headers return pd.read_csv(StringIO(s), delim_whitespace=False) else: