From 41fe1a395570c9d957e4769f6caa761905752c49 Mon Sep 17 00:00:00 2001 From: Ninad Sinha <32181677+AHarmlessPyro@users.noreply.github.com> Date: Thu, 30 Mar 2023 12:03:33 +0530 Subject: [PATCH] Add support for key based patterns analysis (#462) Co-authored-by: Nikhil Shahi --- backend/src/services/data-classes/index.ts | 25 +++++++- backend/src/services/data-classes/utils.ts | 20 +++++- .../src/services/metlo-config/constants.ts | 21 +++++- backend/src/services/scanner/scan.ts | 52 +++++++++++---- backend/src/services/scanner/utils.ts | 12 +++- common/src/types.ts | 1 + ingestors/rust-common/src/metlo_config.rs | 64 +++++++++++++------ ingestors/rust-common/src/process_graphql.rs | 24 +++++-- ingestors/rust-common/src/process_trace.rs | 32 +++++++++- ingestors/rust-common/src/sensitive_data.rs | 43 +++++++++++-- 10 files changed, 239 insertions(+), 55 deletions(-) diff --git a/backend/src/services/data-classes/index.ts b/backend/src/services/data-classes/index.ts index 1e9e9ccd1..00a5dcea6 100644 --- a/backend/src/services/data-classes/index.ts +++ b/backend/src/services/data-classes/index.ts @@ -78,12 +78,22 @@ export const getCombinedDataClasses = async (ctx: MetloContext) => { }) .filter(v => v !== undefined) roughMap.forEach(v => { - const [key, { severity, patterns: regexList, ...rest1 }, ...rest] = - Object.entries(v)[0] + const [ + key, + { severity, patterns: regexList, keyPatterns: keyRegexList, ...rest1 }, + ...rest + ] = Object.entries(v)[0] userDefinedClassMap.push({ className: key, severity: RiskScore[severity] as RiskScore, - regex: new RegExp(regexList.map(regex => `(${regex})`).join("|")), + regex: + regexList && regexList.length > 0 + ? new RegExp(regexList.map(regex => `(${regex})`).join("|")) + : null, + keyRegex: + keyRegexList && keyRegexList.length > 0 + ? new RegExp(keyRegexList.map(regex => `(${regex})`).join("|")) + : null, }) }) } @@ -98,7 +108,16 @@ export const getCombinedDataClasses = async (ctx: MetloContext) => { }) return [...metloDefinedClassMap, ...userDefinedClassMap].map(cls => { if (cls.regex) { + if (cls.keyRegex) { + return { + ...cls, + regex: cls.regex.source, + keyRegex: cls.keyRegex.source, + } + } return { ...cls, regex: cls.regex.source } + } else if (cls.keyRegex) { + return { ...cls, keyRegex: cls.keyRegex.source } } else { return { className: cls.className, diff --git a/backend/src/services/data-classes/utils.ts b/backend/src/services/data-classes/utils.ts index f9e8ebb17..d3a6c636b 100644 --- a/backend/src/services/data-classes/utils.ts +++ b/backend/src/services/data-classes/utils.ts @@ -1,12 +1,20 @@ import { RiskScore } from "@common/enums" import Zod from "zod" -export interface rawDataClass { +interface rawDataClassRegex { className: string severity: RiskScore - regex: RegExp + regex?: RegExp shortName?: string } +interface rawDataClassKeyRegex { + className: string + severity: RiskScore + keyRegex?: RegExp + shortName?: string +} + +export interface rawDataClass extends rawDataClassRegex, rawDataClassKeyRegex {} const scoreArray = Object.keys(RiskScore) @@ -18,5 +26,11 @@ const SCORE_VALUES: [string, ...string[]] = [ export const customDataClass = Zod.object({ severity: Zod.enum(SCORE_VALUES), - patterns: Zod.string().array(), + patterns: Zod.string().array().optional(), + keyPatterns: Zod.string().array().optional(), }) + .partial() + .refine( + data => data.severity && (data.patterns || data.keyPatterns), + "Severity must be provided along with either of patterns or keyPatterns", + ) diff --git a/backend/src/services/metlo-config/constants.ts b/backend/src/services/metlo-config/constants.ts index 06eadbf15..9238db555 100644 --- a/backend/src/services/metlo-config/constants.ts +++ b/backend/src/services/metlo-config/constants.ts @@ -137,8 +137,27 @@ export const METLO_CONFIG_SCHEMA = { format: "regex", }, }, + keyPatterns: { + type: "array", + minItems: 1, + uniqueItems: true, + items: { + type: "string", + format: "regex", + }, + }, }, - required: ["severity", "patterns"], + anyOf: [ + { + required: ["severity", "patterns"], + }, + { + required: ["severity", "keyPatterns"], + }, + { + required: ["severity", "keyPatterns", "patterns"], + }, + ], }, }, additionalProperties: false, diff --git a/backend/src/services/scanner/scan.ts b/backend/src/services/scanner/scan.ts index 43cc12ffc..8598c0641 100644 --- a/backend/src/services/scanner/scan.ts +++ b/backend/src/services/scanner/scan.ts @@ -39,7 +39,7 @@ export const VALIDATION_FUNC_MAP: Record boolean> = { [__DataClass_INTERNAL__.BRAZIL_CPF]: validateBrazilCPF, } -export const scan = (text: any, dataClasses: DataClass[]): string[] => { +export const scanValue = (text: any, dataClasses: DataClass[]): string[] => { const res: string[] = [] let convertedText: string try { @@ -52,16 +52,11 @@ export const scan = (text: any, dataClasses: DataClass[]): string[] => { if (STRING_ONLY_DATA_CLASSES.has(className) && typeof text !== "string") { return } - const r = new RegExp(exp) - const match = r.test(convertedText) - if (match) { - const validationFunc = VALIDATION_FUNC_MAP[className] - if (validationFunc) { - const matchArr = convertedText.match(r) - if (matchArr && validationFunc(matchArr[0])) { - res.push(className) - } - } else { + if (exp) { + const r = new RegExp(exp) + const matchedValue = r.test(convertedText) + const matchRes = returnMatch(matchedValue, className, convertedText, r) + if (matchRes) { res.push(className) } } @@ -69,3 +64,38 @@ export const scan = (text: any, dataClasses: DataClass[]): string[] => { }) return res } + +export const scanKey = (text: string, dataClasses: DataClass[]): string[] => { + const res: string[] = [] + dataClasses.forEach(({ className, keyRegex: keyExp }) => { + if (keyExp) { + const keyMatch = new RegExp(keyExp) + const matchedKey = keyMatch.test(text) + const matchRes = returnMatch(matchedKey, className, text, keyMatch) + if (matchRes) { + res.push(className) + } + } + }) + return res +} + +const returnMatch = ( + match: boolean, + className: string, + convertedText: string, + matcher: RegExp, +): boolean => { + if (match) { + const validationFunc = VALIDATION_FUNC_MAP[className] + if (validationFunc) { + const matchArr = convertedText.match(matcher) + if (matchArr && validationFunc(matchArr[0])) { + return true + } + } else { + return true + } + } + return false +} diff --git a/backend/src/services/scanner/utils.ts b/backend/src/services/scanner/utils.ts index d23e474b1..af4b324d5 100644 --- a/backend/src/services/scanner/utils.ts +++ b/backend/src/services/scanner/utils.ts @@ -2,7 +2,7 @@ import { DataClass, PairObject } from "@common/types" import { DataSection, DataType } from "@common/enums" import { isParameter, parsedJson, parsedJsonNonNull } from "utils" import { getPathTokens } from "@common/utils" -import { scan } from "./scan" +import { scanKey, scanValue } from "./scan" import { getMapDataFields } from "services/data-field/utils" const handleDataField = ( @@ -17,9 +17,15 @@ const handleDataField = ( const key = `${statusCode}_${contentType}_${dataSection}${ dataPath ? `.${dataPath}` : "" }` - const detectedData = scan(dataValue, dataClasses) + const detectedDataInValue = scanValue(dataValue, dataClasses) + const detectedDataInPath = scanKey(dataPath, dataClasses) let newDataClasses = sensitiveDataMap[key] || [] - detectedData.forEach(e => { + detectedDataInValue.forEach(e => { + if (!newDataClasses.includes(e)) { + newDataClasses.push(e) + } + }) + detectedDataInPath.forEach(e => { if (!newDataClasses.includes(e)) { newDataClasses.push(e) } diff --git a/common/src/types.ts b/common/src/types.ts index 8ca6d6b73..edf49311a 100644 --- a/common/src/types.ts +++ b/common/src/types.ts @@ -449,6 +449,7 @@ export interface DataClass { className: string severity: RiskScore regex?: string + keyRegex?: string shortName?: string } diff --git a/ingestors/rust-common/src/metlo_config.rs b/ingestors/rust-common/src/metlo_config.rs index 773fefca0..835d0b149 100644 --- a/ingestors/rust-common/src/metlo_config.rs +++ b/ingestors/rust-common/src/metlo_config.rs @@ -25,6 +25,7 @@ pub struct MetloSensitiveData { class_name: String, severity: String, regex: Option, + key_regex: Option, } #[derive(Debug, Default, Clone, Deserialize, Serialize)] @@ -173,29 +174,54 @@ pub async fn pull_metlo_config() -> Result<(), Box> { .await? .json::() .await?; - let new_sensitive_data: Vec = resp .sensitive_data_list .iter() - .map(|e| match &e.regex { - Some(unwrapped_regex) => { - let regex = Regex::new(unwrapped_regex); - match regex { - Ok(r) => Some(SensitiveData { - sensitive_data_type: e.class_name.clone(), - regex: r, - }), - Err(err) => { - log::debug!( - "Failed to Compile Regex \"{}\" - {}\n", - e.class_name, - err.to_string() - ); - None - } - } + .map(|e| match (&e.regex, &e.key_regex) { + (Some(regex), Some(key_regex)) => { + let _regex = Regex::new(regex); + let _key_regex = Regex::new(key_regex); + Some(SensitiveData { + sensitive_data_type: e.class_name.clone(), + regex: match _regex { + Ok(r) => Some(r), + Err(_) => None, + }, + key_regex: match _key_regex { + Ok(r) => Some(r), + Err(_) => None, + }, + }) + } + (Some(regex), None) => { + let _regex = Regex::new(regex); + Some(SensitiveData { + sensitive_data_type: e.class_name.clone(), + regex: match _regex { + Ok(r) => Some(r), + Err(_) => None, + }, + key_regex: None, + }) + } + (None, Some(key_regex)) => { + let _key_regex = Regex::new(key_regex); + Some(SensitiveData { + sensitive_data_type: e.class_name.clone(), + regex: None, + key_regex: match _key_regex { + Ok(r) => Some(r), + Err(_) => None, + }, + }) + } + (None, None) => { + log::debug!( + "Missing both regex and key_regex fields in \"{}\"\n", + e.class_name, + ); + None } - None => None, }) .flatten() .collect(); diff --git a/ingestors/rust-common/src/process_graphql.rs b/ingestors/rust-common/src/process_graphql.rs index c92b4ca78..a3e91abd6 100644 --- a/ingestors/rust-common/src/process_graphql.rs +++ b/ingestors/rust-common/src/process_graphql.rs @@ -1,5 +1,5 @@ use crate::{ - process_trace::{insert_data_type, process_json_val}, + process_trace::{insert_data_type, process_json_val, process_path}, sensitive_data::detect_sensitive_data, trace::{ GraphQlData, GraphQlRes, KeyVal, Operation, OperationItem, ProcessTraceResInner, Variable, @@ -47,11 +47,21 @@ fn process_graphql_argument<'a>( } } schema::Value::Boolean(_) => { - insert_data_type(data_types, path.as_str(), "boolean".to_owned()) + insert_data_type(data_types, path.as_str(), "boolean".to_owned()); + process_path(&path, path.clone(), sensitive_data_detected); + } + schema::Value::Float(_) => { + insert_data_type(data_types, path.as_str(), "number".to_owned()); + process_path(&path, path.clone(), sensitive_data_detected); + } + schema::Value::Int(_) => { + insert_data_type(data_types, path.as_str(), "number".to_owned()); + process_path(&path, path.clone(), sensitive_data_detected); + } + schema::Value::Null => { + insert_data_type(data_types, path.as_str(), "null".to_owned()); + process_path(&path, path.clone(), sensitive_data_detected); } - schema::Value::Float(_) => insert_data_type(data_types, path.as_str(), "number".to_owned()), - schema::Value::Int(_) => insert_data_type(data_types, path.as_str(), "number".to_owned()), - schema::Value::Null => insert_data_type(data_types, path.as_str(), "null".to_owned()), schema::Value::String(s) => { insert_data_type(data_types, path.as_str(), "string".to_owned()); let text = s.as_str(); @@ -72,6 +82,7 @@ fn process_graphql_argument<'a>( Some(old) => old.extend(sensitive_data), } } + process_path(&path, path.clone(), sensitive_data_detected); } schema::Value::Enum(e) => { let s = &e.to_owned().to_owned(); @@ -94,6 +105,7 @@ fn process_graphql_argument<'a>( Some(old) => old.extend(sensitive_data), } } + process_path(&path, path.clone(), sensitive_data_detected); } schema::Value::List(ls) => { let limit = std::cmp::min(ls.len(), 10); @@ -439,7 +451,7 @@ fn process_graphql_obj(m: &Map) -> Option { let query = m.get("query"); let default_map = Map::new(); let variables_map: &Map = match m.get("variables") { - Some(v) => v.as_object().unwrap(), + Some(v) => v.as_object().unwrap_or(&default_map), None => &default_map, }; let operation_name = match m.get("operationName") { diff --git a/ingestors/rust-common/src/process_trace.rs b/ingestors/rust-common/src/process_trace.rs index bcbafa7d0..c9209345f 100644 --- a/ingestors/rust-common/src/process_trace.rs +++ b/ingestors/rust-common/src/process_trace.rs @@ -1,7 +1,7 @@ use crate::{ open_api::{find_open_api_diff, get_split_path, EndpointInfo}, process_graphql::{process_graphql_body, process_graphql_query}, - sensitive_data::detect_sensitive_data, + sensitive_data::{detect_sensitive_data, detect_sensitive_in_path_data}, trace::{ApiResponse, ApiTrace, GraphQlData, KeyVal, ProcessTraceRes, ProcessTraceResInner}, METLO_CONFIG, }; @@ -120,14 +120,17 @@ pub fn process_json_val( serde_json::Value::Null => { let resolved_path = fix_path(path, response_alias_map); insert_data_type(data_types, resolved_path.as_str(), "null".to_string()); + process_path(path, resolved_path, sensitive_data_detected) } serde_json::Value::Bool(_) => { let resolved_path = fix_path(path, response_alias_map); insert_data_type(data_types, resolved_path.as_str(), "boolean".to_string()); + process_path(path, resolved_path, sensitive_data_detected) } serde_json::Value::Number(_) => { let resolved_path = fix_path(path, response_alias_map); insert_data_type(data_types, resolved_path.as_str(), "number".to_string()); + process_path(path, resolved_path, sensitive_data_detected) } serde_json::Value::String(e) => { let resolved_path = fix_path(path, response_alias_map); @@ -156,6 +159,7 @@ pub fn process_json_val( } } } + process_path(path, resolved_path, sensitive_data_detected) } serde_json::Value::Array(ls) => { let limit = std::cmp::min(ls.len(), 10); @@ -200,6 +204,27 @@ pub fn process_json_val( } } +pub fn process_path( + path: &String, + resolved_path: String, + sensitive_data_detected: &mut HashMap>, +) { + let sensitive_data_path = detect_sensitive_in_path_data(path.as_str()); + if !sensitive_data_path.is_empty() { + let old_sensitive_data = sensitive_data_detected.get_mut(&resolved_path); + match old_sensitive_data { + None => { + sensitive_data_detected.insert(resolved_path.clone(), sensitive_data_path); + } + Some(old) => { + for e in sensitive_data_path { + old.insert(e); + } + } + } + } +} + fn process_json( prefix: String, value: Value, @@ -392,7 +417,7 @@ fn process_key_val(prefix: String, vals: &Vec) -> Option { - sensitive_data_detected.insert(path, sensitive_data); + sensitive_data_detected.insert(path.clone(), sensitive_data); } Some(old) => { for e in sensitive_data { @@ -401,6 +426,7 @@ fn process_key_val(prefix: String, vals: &Vec) -> Option (ProcessTraceRes, bool) { let mut endpoint_path: String = trace.request.url.path.clone(); let split_path: Vec<&str> = get_split_path(&trace.request.url.path); let conf_read = METLO_CONFIG.try_read(); - if let Ok(ref conf) = METLO_CONFIG.try_read() { + if let Ok(ref conf) = conf_read { if let Some(endpoints) = &conf.endpoints { let key = format!( "{}-{}", diff --git a/ingestors/rust-common/src/sensitive_data.rs b/ingestors/rust-common/src/sensitive_data.rs index 1fab4fbe9..ae4abc784 100644 --- a/ingestors/rust-common/src/sensitive_data.rs +++ b/ingestors/rust-common/src/sensitive_data.rs @@ -7,18 +7,21 @@ use crate::METLO_CONFIG; #[derive(Debug, Clone)] pub struct SensitiveData { pub sensitive_data_type: String, - pub regex: Regex, + pub regex: Option, + pub key_regex: Option, } lazy_static! { pub static ref DEFAULT_SENSITIVE_DATA_LS: Vec = vec![ SensitiveData { - sensitive_data_type: "email".to_string(), - regex: Regex::new(r#"(^|\s)(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])(\s|$)"#).unwrap(), + sensitive_data_type: "Email".to_string(), + regex: Some(Regex::new(r#"(^|\s)(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])(\s|$)"#).unwrap()), + key_regex: None }, SensitiveData { - sensitive_data_type: "ipv4".to_string(), - regex: Regex::new(r#"(^|\s)(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\s|$)"#).unwrap(), + sensitive_data_type: "IP Address".to_string(), + regex: Some(Regex::new(r#"(^|\s)(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\s|$)"#).unwrap()), + key_regex: None }, ]; static ref AADHAR_MULT: Vec> = vec![ @@ -116,7 +119,10 @@ fn validate(sensitive_data_type: String, text: &str) -> bool { fn detect_sensitive_data_inner(txt: &str, sensitive_data: &[SensitiveData]) -> HashSet { sensitive_data .iter() - .filter(|e| e.regex.is_match(txt) && validate(e.sensitive_data_type.clone(), txt)) + .filter(|e| match &e.regex { + Some(regex) => regex.is_match(txt) && validate(e.sensitive_data_type.clone(), txt), + None => false, + }) .map(|e| e.sensitive_data_type.clone()) .collect() } @@ -131,3 +137,28 @@ pub fn detect_sensitive_data(txt: &str) -> HashSet { Err(_err) => detect_sensitive_data_inner(txt, &DEFAULT_SENSITIVE_DATA_LS), } } + +fn detect_sensitive_data_in_path_inner( + txt: &str, + sensitive_data: &[SensitiveData], +) -> HashSet { + sensitive_data + .iter() + .filter(|e| match &e.key_regex { + Some(regex) => regex.is_match(txt) && validate(e.sensitive_data_type.clone(), txt), + None => false, + }) + .map(|e| e.sensitive_data_type.clone()) + .collect() +} + +pub fn detect_sensitive_in_path_data(txt: &str) -> HashSet { + let conf_read = METLO_CONFIG.try_read(); + match conf_read { + Ok(conf) => match &conf.sensitive_data { + Some(s) => detect_sensitive_data_in_path_inner(txt, s), + None => HashSet::new(), + }, + Err(_err) => HashSet::new(), + } +}