From 69daf9e212ecb544e838a9c30e3b2d50fffa1ada Mon Sep 17 00:00:00 2001
From: Ryan
Date: Sun, 17 Jul 2022 12:19:49 -0700
Subject: [PATCH 1/3] Case where key is present ('test1') and value is empty
 ('') led to the node label being "None"; this fixes so it's "test1:"

---
 unfurl/core.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/unfurl/core.py b/unfurl/core.py
index cedb197..df4b821 100644
--- a/unfurl/core.py
+++ b/unfurl/core.py
@@ -72,6 +72,8 @@ def __init__(self, node_id, data_type, key, value, label=None, hover=None,
             self.label = f'{self.key}: {self.value}'
         elif self.value:
             self.label = self.value
+        elif self.key:
+            self.label = f'{self.key}:'
 
     def __repr__(self):
         return str(self.__dict__)

From a3cff15c49d4abc97ff2081cf0a4827f86398ee4 Mon Sep 17 00:00:00 2001
From: Ryan
Date: Sun, 17 Jul 2022 12:20:58 -0700
Subject: [PATCH 2/3] Expand match for inflated b64 string to include more
 characters (=, &, %, ., -)

---
 unfurl/parsers/parse_compressed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unfurl/parsers/parse_compressed.py b/unfurl/parsers/parse_compressed.py
index 1ff7523..c7366c5 100644
--- a/unfurl/parsers/parse_compressed.py
+++ b/unfurl/parsers/parse_compressed.py
@@ -86,7 +86,7 @@ def run(unfurl, node):
         try:
             inflated_str = inflated_bytes.decode('ascii', errors='strict')
 
-            if re.fullmatch(r'\w+', inflated_str):
+            if re.fullmatch(r'[\w=&%\.-]+', inflated_str):
                 unfurl.add_to_queue(
                     data_type='string', key=None, value=inflated_str,
                     parent_id=node.node_id,
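
The PATCH 1/3 fallthrough is easy to check in isolation. In the sketch below, Node is a stripped-down stand-in for unfurl's real class (the guard on the first branch is simplified; the actual constructor takes more fields), just to show the three label cases:

    # Minimal sketch of the label fallthrough fixed in PATCH 1/3.
    # 'Node' here is a reduced stand-in, not unfurl's actual API.
    class Node:
        def __init__(self, key, value):
            self.key, self.value, self.label = key, value, None
            if self.key and self.value:
                self.label = f'{self.key}: {self.value}'
            elif self.value:
                self.label = self.value
            elif self.key:  # the branch added by the patch
                self.label = f'{self.key}:'

    assert Node('test1', 'v').label == 'test1: v'
    assert Node(None, 'v').label == 'v'
    assert Node('test1', '').label == 'test1:'  # stayed None before the fix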
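PATCH 2/3 only widens the character class used to decide whether an inflated buffer looks like printable text. A quick comparison of the old and new patterns (the sample value is made up, but uses exactly the characters the old pattern rejected):

    import re

    old_pattern, new_pattern = r'\w+', r'[\w=&%\.-]+'
    inflated_str = 'id=123&ts=1658000000&pct=99.5%'  # hypothetical decoded value

    print(bool(re.fullmatch(old_pattern, inflated_str)))  # False: \w excludes =, &, %, .
    print(bool(re.fullmatch(new_pattern, inflated_str)))  # True with the expanded class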
From 364d735846bb0e75cbb520b1fc539246aac45b7b Mon Sep 17 00:00:00 2001
From: Ryan
Date: Mon, 18 Jul 2022 15:20:55 -0700
Subject: [PATCH 3/3] Refactor a couple URL parsing sections and move to
 functions where possible.

---
 unfurl/parsers/parse_url.py | 152 +++++++++++++++++++++---------------
 1 file changed, 89 insertions(+), 63 deletions(-)

diff --git a/unfurl/parsers/parse_url.py b/unfurl/parsers/parse_url.py
index bc27402..3dc93da 100644
--- a/unfurl/parsers/parse_url.py
+++ b/unfurl/parsers/parse_url.py
@@ -26,8 +26,58 @@
 }
 
 
+def parse_delimited_string(unfurl_instance, node, delimiter, pairs=False) -> None:
+    split_values = node.value.split(delimiter)
+
+    for split_value in split_values:
+        if pairs:
+            key, value = split_value.split('=')
+            unfurl_instance.add_to_queue(
+                data_type='url.query.pair', key=key, value=value,
+                parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+        else:
+            unfurl_instance.add_to_queue(
+                data_type='string', key=None, value=split_value,
+                parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+
+
+def try_url_unquote(unfurl_instance, node) -> bool:
+    unquoted = urllib.parse.unquote_plus(node.value)
+    if unquoted != node.value:
+        unfurl_instance.add_to_queue(
+            data_type='string', key=None, value=unquoted,
+            hover='Unquoted URL (replaced %xx escapes with their single-character equivalent)',
+            parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+        return True
+    return False
+
+
+def try_url_parse(unfurl_instance, node) -> bool:
+    try:
+        parsed_url = urllib.parse.urlparse(node.value)
+        if (parsed_url.netloc and parsed_url.path) or (parsed_url.scheme and parsed_url.netloc):
+            unfurl_instance.add_to_queue(
+                data_type='url', key=None, value=node.value, parent_id=node.node_id,
+                incoming_edge_config=urlparse_edge)
+            return True
+        return False
+    except:
+        return False
+
+
 def run(unfurl, node):
+    if not node.data_type.startswith('url'):
+        try:
+            # If a node isn't of type 'url' (but maybe 'string' or something) but we can recognize its
+            # value as a URL, update the data_type so the rest of the parser can act on it.
+            parsed_url = urllib.parse.urlparse(node.value)
+            if (parsed_url.netloc and parsed_url.path) or (parsed_url.scheme and parsed_url.netloc):
+                node.data_type = 'url'
+        except:
+            # Guess it wasn't a URL
+            return
+
     if node.data_type == 'url':
         parsed_url = urllib.parse.urlparse(node.value)
 
@@ -105,16 +155,9 @@ def run(unfurl, node):
         # If the query string or fragment is actually another URL (as seen in some redirectors), we want to
         # continue doing subsequent parsing on it. For that, we need to recognize it and change the data_type to url.
         if not parsed_qs:
-            try:
-                parsed_url = urllib.parse.urlparse(node.value)
-                if (parsed_url.netloc and parsed_url.path) or (parsed_url.scheme and parsed_url.netloc):
-                    unfurl.add_to_queue(
-                        data_type='url', key=None, value=node.value, parent_id=node.node_id,
-                        incoming_edge_config=urlparse_edge)
-                    return
-            except:
-                # Guess it wasn't a URL
-                pass
+            parsed = try_url_parse(unfurl, node)
+            if parsed:
+                return
 
     elif node.data_type == 'url.params':
         split_params_re = re.compile(r'^(?P<key>[^=]+?)=(?P<value>[^=?]+)(?P<delim>[;,|])')
@@ -163,35 +206,38 @@ def run(unfurl, node):
                   'per RFC3986',
             parent_id=node.node_id, incoming_edge_config=urlparse_edge)
 
-    elif node.data_type == 'url.query.pair' and node.key in ['l', 'lang', 'language', 'set-lang']:
+    elif node.data_type == 'url.query.pair':
+        if node.key in ['l', 'lang', 'language', 'set-lang']:
+            language = None
 
-        if len(node.value) == 2:
-            language = pycountry.languages.get(alpha_2=node.value)
-        elif len(node.value) == 3:
-            language = pycountry.languages.get(alpha_3=node.value)
-        else:
-            return
+            if len(node.value) == 2:
+                language = pycountry.languages.get(alpha_2=node.value)
+            elif len(node.value) == 3:
+                language = pycountry.languages.get(alpha_3=node.value)
 
-        if language:
-            unfurl.add_to_queue(
-                data_type='descriptor', key='Language', value=language.name,
-                hover='This is a generic parser based on common query-string patterns across websites',
-                parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            if language:
+                unfurl.add_to_queue(
+                    data_type='descriptor', key='Language', value=language.name,
+                    hover='This is a generic parser based on common query-string patterns across websites',
+                    parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            return
 
-    elif node.data_type == 'url.query.pair' and node.key in ['c', 'cc', 'country', 'country_code']:
+        elif node.key in ['c', 'cc', 'country', 'country_code']:
+            country = None
 
-        if len(node.value) == 2:
-            country = pycountry.countries.get(alpha_2=node.value)
-        elif len(node.value) == 3:
-            country = pycountry.countries.get(alpha_3=node.value)
-        else:
-            return
+            if len(node.value) == 2:
+                country = pycountry.countries.get(alpha_2=node.value)
+            elif len(node.value) == 3:
+                country = pycountry.countries.get(alpha_3=node.value)
 
-        if country:
-            unfurl.add_to_queue(
-                data_type='descriptor', key='Country', value=country.name,
-                hover='This is a generic parser based on common query-string patterns across websites',
-                parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            if country:
+                unfurl.add_to_queue(
+                    data_type='descriptor', key='Country', value=country.name,
+                    hover='This is a generic parser based on common query-string patterns across websites',
+                    parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            return
+
+        try_url_unquote(unfurl, node)
 
     elif node.data_type == 'url.path.segment':
         for file_type in mimetypes.types_map.keys():
@@ -210,29 +256,16 @@ def run(unfurl, node):
         if not isinstance(node.value, str):
             return
 
-        try:
-            # If we can recognize another URL inside a value, parse it
-            parsed_url = urllib.parse.urlparse(node.value)
-            if (parsed_url.netloc and parsed_url.path) or (parsed_url.scheme and parsed_url.netloc):
-                unfurl.add_to_queue(
-                    data_type='url', key=None, value=node.value, parent_id=node.node_id,
-                    incoming_edge_config=urlparse_edge)
-                return
-        except:
-            # Guess it wasn't a URL
-            pass
+        parsed = try_url_parse(unfurl, node)
+        if parsed:
+            return
 
         # If the value contains more pairs of the form "a=b|c=d|e=f"
         pipe_delimited_pairs_re = re.compile(
             r'((?P<key>[^|=]+)=(?P<value>[^|=]+)\|)+(?P<last_key>[^|=]+)=(?P<last_value>[^|=]+)')
         m = pipe_delimited_pairs_re.fullmatch(node.value)
         if m:
-            pipe_pairs = node.value.split('|')
-            for pair in pipe_pairs:
-                key, value = pair.split('=')
-                unfurl.add_to_queue(
-                    data_type='url.query.pair', key=key, value=value,
-                    parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            parse_delimited_string(unfurl, node, delimiter='|', pairs=True)
             return
 
         # If the value contains more values in the form "a|b|c|d|e|f"
         pipe_delimited_values_re = re.compile(
             r'((?P<value>[^|]+)\|)+(?P<last_value>[^|]+)')
         m = pipe_delimited_values_re.fullmatch(node.value)
         if m:
-            pipe_values = node.value.split('|')
-            for value in pipe_values:
-                unfurl.add_to_queue(
-                    data_type='string', key=None, value=value,
-                    parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            parse_delimited_string(unfurl, node, delimiter='|')
             return
 
         # If the value contains more pairs of the form "a=b&c=d&e=f"
         amp_delimited_pairs_re = re.compile(
-            r'((?P<key>[^&=]+)=(?P<value>[^&=]+)&)+(?P<last_key>[^&=]+)=(?P<last_value>[^&=]+)')
+            r'((?P<key>[^&=]+)=(?P<value>[^&=]*)&)+(?P<last_key>[^&=]+)=(?P<last_value>[^&=]*)')
         m = amp_delimited_pairs_re.fullmatch(node.value)
         if m:
-            amp_pairs = node.value.split('&')
-            for pair in amp_pairs:
-                key, value = pair.split('=')
-                unfurl.add_to_queue(
-                    data_type='url.query.pair', key=key, value=value,
-                    parent_id=node.node_id, incoming_edge_config=urlparse_edge)
+            parse_delimited_string(unfurl, node, delimiter='&', pairs=True)
             return
+
+        try_url_unquote(unfurl, node)
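
The three helpers introduced in PATCH 3/3 can be exercised without the rest of the pipeline, assuming parse_delimited_string, try_url_parse, and try_url_unquote from the patch are pasted into the same file. Everything named Stub* below is invented for illustration, and urlparse_edge is stubbed to None (the real module defines it as an edge-config dict):

    import urllib.parse

    class StubNode:
        def __init__(self, value):
            self.node_id, self.data_type, self.value = 1, 'string', value

    class StubUnfurl:
        def add_to_queue(self, **kwargs):
            # The real Unfurl.add_to_queue schedules a child node; here we just print.
            print(kwargs['data_type'], kwargs.get('key'), kwargs['value'])

    urlparse_edge = None  # stand-in for the module-level edge config

    unfurl = StubUnfurl()
    parse_delimited_string(unfurl, StubNode('a=b|c=d|e=f'), delimiter='|', pairs=True)
    # url.query.pair a b / url.query.pair c d / url.query.pair e f

    print(try_url_parse(unfurl, StubNode('https://example.com/path')))  # True: queued as 'url'
    print(try_url_unquote(unfurl, StubNode('one%20two+three')))         # True: 'one two three'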
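A side note on how the patches interact: the last hunk widens the ampersand-pair pattern to accept empty values ('[^&=]*'), so a pair like 'test1=' becomes a url.query.pair node with an empty value, which is exactly the node-label case PATCH 1/3 fixes. A standalone check, using the same pattern as the hunk above and a made-up query string:

    import re

    amp_re = re.compile(
        r'((?P<key>[^&=]+)=(?P<value>[^&=]*)&)+(?P<last_key>[^&=]+)=(?P<last_value>[^&=]*)')
    qs = 'test1=&test2=abc'  # hypothetical value with an empty first pair

    print(bool(amp_re.fullmatch(qs)))                    # True: empty values now match
    print([pair.split('=') for pair in qs.split('&')])   # [['test1', ''], ['test2', 'abc']]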