feat(analyzers): Implement base64asciianalyzer (#172)

Thank you for your contribution(s) - especially for the tests, since, from my experience, these take most of the time to set up. All tests are green, the analyzer matches only what it should, codacy is happy with the code quality, so I am happy to merge this PR. fixes #166 ___ * feat(analyzers): implement base64asciianalyzer * test(analyzers) add test to base64analyzers * fix: use proper name __init__ * feat: add option to return decoded b64 from analyzer The b64asciianalyzer can now return either the original text or the decoded result * fix: add check for valid ascii characters by using the decode method on the byte string and catching exceptions, we can figure out if the result is actual ascii or other data. * update base64asciianalyzer tests for decode flag functionality * fix(test): correct order of equality comparisons The first parameter is always the expected result. The second one is the sample one. * fix(test): check for newline character The original test did not actually check for newline/linefeed character but for the literal string '\n'. Co-authored-by: Rico <[email protected]>
d-Rickyy-b · Feb 12, 2020 · b535781 · b535781
1 parent 6345639
commit b535781
Show file tree

Hide file tree

Showing 5 changed files with 216 additions and 1 deletion.
diff --git a/pastepwn/analyzers/__init__.py b/pastepwn/analyzers/__init__.py
@@ -6,6 +6,7 @@
 from .awssessiontokenanalyzer import AWSSessionTokenAnalyzer
 from .azuresubscriptionkeyanalyzer import AzureSubscriptionKeyAnalyzer
 from .base64analyzer import Base64Analyzer
+from .base64asciianalyzer import Base64AsciiAnalyzer
 from .basicanalyzer import BasicAnalyzer
 from .battlenetkeyanalyzer import BattleNetKeyAnalyzer
 from .bcrypthashanalyzer import BcryptHashAnalyzer
@@ -50,6 +51,7 @@
     'AWSSessionTokenAnalyzer',
     'AzureSubscriptionKeyAnalyzer',
     'Base64Analyzer',
+    'Base64AsciiAnalyzer',
     'BasicAnalyzer',
     'BattleNetKeyAnalyzer',
     'BcryptHashAnalyzer',

diff --git a/pastepwn/analyzers/base64analyzer.py b/pastepwn/analyzers/base64analyzer.py
@@ -12,7 +12,7 @@ def __init__(self, actions, min_len=1):
         super().__init__(actions, regex)
 
     def verify(self, results):
-        """Method to perform additional checks to test if the found strings are of sufficient lenght"""
+        """Method to perform additional checks to test if the found strings are of sufficient length"""
         validated_strings = []
 
         for result in results:

diff --git a/pastepwn/analyzers/base64asciianalyzer.py b/pastepwn/analyzers/base64asciianalyzer.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+from .base64analyzer import Base64Analyzer
+from base64 import b64decode
+import binascii
+
+
+class Base64AsciiAnalyzer(Base64Analyzer):
+    """Analyzer to match base64 strings which decode to valid ASCII"""
+    name = 'Base64AsciiAnalyzer'
+
+    def __init__(self, actions, min_len=1, decode=False):
+        super().__init__(actions, min_len)
+        self.decode = decode
+
+    def verify(self, results):
+        """Method to determine if found base64 decodes to valid ASCII"""
+        # find valid base64 strings with the parent class
+        validated_strings = super().verify(results)
+
+        # go through each base64 string and attempt to decode
+        base64_ascii_strings = []
+
+        for validated_string in validated_strings:
+            # Check if the string is valid base64
+            try:
+                decoded_string = b64decode(validated_string)
+            except binascii.Error:
+                # The string is not valid base64
+                continue
+
+            # Check if the valid base64 decodes to plain ascii
+            try:
+                b64_ascii_string = decoded_string.decode('ascii')
+            except UnicodeDecodeError:
+                continue
+
+            if self.decode:
+                base64_ascii_strings.append(b64_ascii_string)
+            else:
+                base64_ascii_strings.append(validated_string)
+
+        return base64_ascii_strings
diff --git a/pastepwn/analyzers/tests/base64analyzer_test.py b/pastepwn/analyzers/tests/base64analyzer_test.py
@@ -109,6 +109,16 @@ def test_match_negative(self):
                           "Dj448rhbNTJrKhRn7TPkYRubZLhmbCrg6bavDa9a"
         self.assertFalse(self.analyzer.match(self.paste))
 
+    def test_invalid_decodes(self):
+        """Test to make sure we match all base64 strings even ones that don't decode to ASCII."""
+        # base64 encoded string containing one non-ascii character: "This string contains a non-ascii character: ¤" (UTF-8)
+        self.paste.body = "VGhpcyBzdHJpbmcgY29udGFpbnMgYSBub24tYXNjaWkgY2hhcmFjdGVyOiDCpA=="
+        self.assertTrue(self.analyzer.match(self.paste))
+
+        # base64 encoded string containing only non-ascii characters: "ΗÈλλθ ωÖΓλÐ" (UTF-8)
+        self.paste.body = "zpfDiM67zrvOuCDPicOWzpPOu8OQ"
+        self.assertTrue(self.analyzer.match(self.paste))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pastepwn/analyzers/tests/base64asciianalyzer_test.py b/pastepwn/analyzers/tests/base64asciianalyzer_test.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+import unittest
+from unittest import mock
+
+from pastepwn.analyzers.base64asciianalyzer import Base64AsciiAnalyzer
+
+
+class TestBase64AsciiAnalyzer(unittest.TestCase):
+    def setUp(self):
+        self.analyzer = Base64AsciiAnalyzer(None)
+        self.paste = mock.Mock()
+
+    def test_match_positive(self):
+        """Test if positives are recognized"""
+        # base64 encoded string: "Hello World" (UTF-8, LF)
+        self.paste.body = "SGVsbG8gV29ybGQ="
+        self.assertTrue(self.analyzer.match(self.paste))
+
+        # base64 encoded string: "Hello\nWorld" where "\n" is the literal two characters backslash + n, not a linefeed (UTF-8)
+        self.paste.body = "SGVsbG9cbldvcmxk"
+        self.assertTrue(self.analyzer.match(self.paste))
+
+        # base64 encoded string (32 chars): "2fwZ_CTjDKxu48FLCLZcGdB!sEj5XRQh" (UTF-8, LF)
+        self.paste.body = "MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg="
+        self.assertTrue(self.analyzer.match(self.paste))
+
+        # base64 encoded string (64 chars): "Mv=ZH?NJrrBSdhus*KVg%4dG6*C&ub?sSeq!VrzCb_-QcY^KWfxKy8AJ3=^5?b6N"
+        # (UTF-8, LF)
+        self.paste.body = "TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg=="
+        self.assertTrue(self.analyzer.match(self.paste))
+
+        # base64 encoded string (256 chars): "etFk!?m@A_vvdMT39Mgcynx_AFz6HY!4R8U3n_7JA?-rF=F3ehWat%4rKfhsuCc98G
+        # =t8jMY7hgJDZ2c!y!$!XQATbk6fQD2pa+EdQ_rfP^&_DKJ34dFPcuGjDBTqdxZ&=3U%@dm&?JW#+k@mB%a3TFn%GAzukL+-%TUTq?fAbAKr
+        # @y%LPK+KEmxeh+rg7?s3aR2v5A%tbn&_7zNMckCPRd&s8$wW5Bec@aRMCs@4rn?cRx?a&y-Z%kn&h8aLu*R" (UTF-8, LF)
+        self.paste.body = "ZXRGayE/bUBBX3Z2ZE1UMzlNZ2N5bnhfQUZ6NkhZITRSOFUzbl83SkE/LXJGPUYzZWhXYXQlNHJLZmhzdUNjO" \
+                          "ThHPXQ4ak1ZN2hnSkRaMmMheSEkIVhRQVRiazZmUUQycGErRWRRX3JmUF4mX0RLSjM0ZEZQY3VHakRCVHFkeF" \
+                          "omPTNVJUBkbSY/SlcjK2tAbUIlYTNURm4lR0F6dWtMKy0lVFVUcT9mQWJBS3JAeSVMUEsrS0VteGVoK3JnNz9" \
+                          "zM2FSMnY1QSV0Ym4mXzd6Tk1ja0NQUmQmczgkd1c1QmVjQGFSTUNzQDRybj9jUng/YSZ5LVola24maDhhTHUqUg=="
+        self.assertTrue(self.analyzer.match(self.paste))
+
+    def test_intext(self):
+        """Test if matches inside text are recognized"""
+        self.paste.body = "I wan to tel you tha TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg== is " \
+                          "very important"
+        match = self.analyzer.match(self.paste)
+        self.assertTrue(match)
+        self.assertEqual("TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg==", match[0])
+
+    def test_multiple(self):
+        """Test if multiple matches are recognized"""
+        # Needed to keep the words at 3 chars or fewer each. Otherwise they would match as well
+        self.paste.body = "I wan to tel you tha TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg== is " \
+                          "ver imp.\nBut not onl tha, it's als MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg= and muc mor!"
+        match = self.analyzer.match(self.paste)
+        self.assertTrue(match)
+        self.assertEqual("TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg==", match[0])
+        self.assertEqual("MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg=", match[1])
+
+    def test_multiple_min_len(self):
+        """Test if we can match multiple base64 strings in a longer text with min_len"""
+        analyzer = Base64AsciiAnalyzer(None, min_len=8)
+        self.paste.body = "I wanted to tell you that TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg== is " \
+                          "very important.\nBut not only that, it's also MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg= and much more!"
+        match = analyzer.match(self.paste)
+        self.assertTrue(match)
+        self.assertEqual("TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg==", match[0])
+        self.assertEqual("MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg=", match[1])
+
+    def test_min_len(self):
+        """Test if the min_len parameter works as expected"""
+        self.paste.body = "dGVz"
+        analyzer = Base64AsciiAnalyzer(None, min_len=4)
+        match = analyzer.match(self.paste)
+        self.assertTrue(match)
+
+        self.paste.body = "dGVz"
+        analyzer = Base64AsciiAnalyzer(None, min_len=5)
+        match = analyzer.match(self.paste)
+        self.assertFalse(match)
+
+        self.paste.body = "dGVzdFRoaXNTdHJpbmc="
+        match = analyzer.match(self.paste)
+        self.assertTrue(match)
+
+    def test_match_negative(self):
+        """Test if negatives are not recognized"""
+        # test that when an empty body is provided, nothing matches
+        self.paste.body = ""
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # test that when None is provided as the body, nothing matches
+        self.paste.body = None
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # invalid base64 string (% symbol inserted which is not valid base64)
+        self.paste.body = "SGVsbG8gV%29ybGQ="
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # not a base64 string
+        self.paste.body = "====="
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # base32 encoded string
+        self.paste.body = "JBSWY3DPEBLW64TMMQ======"
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # long string (129) not base64
+        self.paste.body = "sFm2XgxTt6fuErnWw9JZkae76sL7XDqyNvf2Wkatt9gkzVDxXTf6dCr3Yh6fT82fFzvNWG49P3KSR7XXngHJ5D9ba" \
+                          "Dj448rhbNTJrKhRn7TPkYRubZLhmbCrg6bavDa9a"
+        self.assertFalse(self.analyzer.match(self.paste))
+
+    def test_invalid_decodes(self):
+        """Test to make sure we don't match base64 strings which don't decode to ASCII"""
+        # base64 encoded string containing one non-ascii character: "This string contains a non-ascii character: ¤" (UTF-8)
+        self.paste.body = "VGhpcyBzdHJpbmcgY29udGFpbnMgYSBub24tYXNjaWkgY2hhcmFjdGVyOiDCpA=="
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # base64 encoded string containing only non-ascii characters: "ΗÈλλθ ωÖΓλÐ" (UTF-8)
+        self.paste.body = "zpfDiM67zrvOuCDPicOWzpPOu8OQ"
+        self.assertFalse(self.analyzer.match(self.paste))
+
+        # base64 encoded string containing one non-ascii character: "º" (UTF-8)
+        self.paste.body = "wro="
+        self.assertFalse(self.analyzer.match(self.paste))
+
+    def test_ascii_decode(self):
+        """Test if ascii decode flag works"""
+        analyzer = Base64AsciiAnalyzer(None, decode=True)
+
+        # base64 encoded string: "Hello World" (UTF-8, LF)
+        self.paste.body = "SGVsbG8gV29ybGQ="
+        self.assertEqual("Hello World", analyzer.match(self.paste)[0])
+
+        # base64 encoded string: "Hello\nWorld" (UTF-8, LF)
+        self.paste.body = "SGVsbG8KV29ybGQ="
+        self.assertEqual("Hello\nWorld", analyzer.match(self.paste)[0])
+
+        # base64 encoded string (32 chars): "2fwZ_CTjDKxu48FLCLZcGdB!sEj5XRQh" (UTF-8, LF)
+        self.paste.body = "MmZ3Wl9DVGpES3h1NDhGTENMWmNHZEIhc0VqNVhSUWg="
+        self.assertEqual("2fwZ_CTjDKxu48FLCLZcGdB!sEj5XRQh", analyzer.match(self.paste)[0])
+
+        # base64 encoded string (64 chars): "Mv=ZH?NJrrBSdhus*KVg%4dG6*C&ub?sSeq!VrzCb_-QcY^KWfxKy8AJ3=^5?b6N"
+        # (UTF-8, LF)
+        self.paste.body = "TXY9Wkg/TkpyckJTZGh1cypLVmclNGRHNipDJnViP3NTZXEhVnJ6Q2JfLVFjWV5LV2Z4S3k4QUozPV41P2I2Tg=="
+        self.assertEqual("Mv=ZH?NJrrBSdhus*KVg%4dG6*C&ub?sSeq!VrzCb_-QcY^KWfxKy8AJ3=^5?b6N", analyzer.match(self.paste)[0])
+
+        # base64 encoded string (256 chars): "etFk!?m@A_vvdMT39Mgcynx_AFz6HY!4R8U3n_7JA?-rF=F3ehWat%4rKfhsuCc98G
+        # =t8jMY7hgJDZ2c!y!$!XQATbk6fQD2pa+EdQ_rfP^&_DKJ34dFPcuGjDBTqdxZ&=3U%@dm&?JW#+k@mB%a3TFn%GAzukL+-%TUTq?fAbAKr
+        # @y%LPK+KEmxeh+rg7?s3aR2v5A%tbn&_7zNMckCPRd&s8$wW5Bec@aRMCs@4rn?cRx?a&y-Z%kn&h8aLu*R" (UTF-8, LF)
+        self.paste.body = "ZXRGayE/bUBBX3Z2ZE1UMzlNZ2N5bnhfQUZ6NkhZITRSOFUzbl83SkE/LXJGPUYzZWhXYXQlNHJLZmhzdUNjO" \
+                          "ThHPXQ4ak1ZN2hnSkRaMmMheSEkIVhRQVRiazZmUUQycGErRWRRX3JmUF4mX0RLSjM0ZEZQY3VHakRCVHFkeF" \
+                          "omPTNVJUBkbSY/SlcjK2tAbUIlYTNURm4lR0F6dWtMKy0lVFVUcT9mQWJBS3JAeSVMUEsrS0VteGVoK3JnNz9" \
+                          "zM2FSMnY1QSV0Ym4mXzd6Tk1ja0NQUmQmczgkd1c1QmVjQGFSTUNzQDRybj9jUng/YSZ5LVola24maDhhTHUqUg=="
+        self.assertEqual("etFk!?m@A_vvdMT39Mgcynx_AFz6HY!4R8U3n_7JA?-rF=F3ehWat%4rKfhsuCc98G" \
+                         "=t8jMY7hgJDZ2c!y!$!XQATbk6fQD2pa+EdQ_rfP^&_DKJ34dFPcuGjDBTqdxZ&=3U%" \
+                         "@dm&?JW#+k@mB%a3TFn%GAzukL+-%TUTq?fAbAKr@y%LPK+KEmxeh+rg7?s3aR2v5A%tbn&" \
+                         "_7zNMckCPRd&s8$wW5Bec@aRMCs@4rn?cRx?a&y-Z%kn&h8aLu*R", analyzer.match(self.paste)[0])
+
+
+if __name__ == '__main__':
+    unittest.main()