From 8f2cb579a5280d21d37fc397be8ba3226c2a9414 Mon Sep 17 00:00:00 2001 From: br0kej Date: Wed, 11 Sep 2024 22:49:03 +0100 Subject: [PATCH 1/3] [refactor/bugfix] Changing how the CFG edge lists are created to account for heavily optimised CFG's and make it much simpler overall --- src/agfj.rs | 76 +++++++++++++++++++++++++++-------------------------- src/bb.rs | 75 +++++++++------------------------------------------- 2 files changed, 52 insertions(+), 99 deletions(-) diff --git a/src/agfj.rs b/src/agfj.rs index 8d64697..d4f5a17 100644 --- a/src/agfj.rs +++ b/src/agfj.rs @@ -136,17 +136,13 @@ impl AGFJFunc { } pub fn create_bb_edge_list(&mut self, min_blocks: &u16) { if self.blocks.len() > (*min_blocks).into() && self.blocks[0].offset != 1 { - let mut addr_idxs = Vec::::new(); - + let bb_start_addrs: Vec = self.blocks.iter().map(|x| x.offset).collect::>(); let mut edge_list = Vec::<(u32, u32, u32)>::new(); - let min_offset: u64 = self.offset; - let max_offset: u64 = self.offset + self.size.unwrap_or(0); - for bb in &self.blocks { - bb.get_block_edges(&mut addr_idxs, &mut edge_list, max_offset, min_offset) + bb.get_block_edges(&bb_start_addrs, &mut edge_list) } - self.addr_idx = Some(addr_idxs); + self.addr_idx = Some(bb_start_addrs); self.edge_list = Some(edge_list); } } @@ -279,6 +275,13 @@ impl AGFJFunc { feature_type: FeatureType, inference_job: &Option>, ) { + /* + This function needs some serious sorting out. + + - Need to get GPU toggle-able + - Need to use new CFG edge builder + - General refactor + */ info!("Processing {:?}", self.name); let full_output_path = get_save_file_path(path, output_path, Some(".json".to_string()), None, None); @@ -286,16 +289,14 @@ impl AGFJFunc { // offset != 1 has been added to skip functions with invalid instructions if self.blocks.len() >= (*min_blocks).into() && self.blocks[0].offset != 1 { - let mut addr_idxs = Vec::::new(); - + let bb_start_addrs: Vec = self.blocks.iter().map(|x| x.offset).collect::>(); let mut edge_list = Vec::<(u32, u32, u32)>::new(); let mut feature_vecs = Vec::<_>::new(); let mut feature_vec_of_vecs = Vec::<_>::new(); - let min_offset = self.offset; - let max_offset = self.offset + self.size.unwrap_or(0); + for bb in &self.blocks { - bb.get_block_edges(&mut addr_idxs, &mut edge_list, max_offset, min_offset); + bb.get_block_edges(&bb_start_addrs, &mut edge_list); if inference_job.is_some() { let inference = inference_job.as_ref().unwrap().clone(); match feature_type { @@ -393,7 +394,6 @@ impl AGFJFunc { if !Path::new(&fname_string).is_file() { // offset != 1 has been added to skip functions with invalid instructions if self.blocks.len() >= (*min_blocks).into() && self.blocks[0].offset != 1 { - let mut addr_idxs = Vec::::new(); let mut edge_list = Vec::<(u32, u32, u32)>::new(); let mut feature_vecs: StringOrF64 = match feature_type { @@ -411,8 +411,9 @@ impl AGFJFunc { } }; - let min_offset: u64 = self.offset; - let max_offset: u64 = self.offset + self.size.unwrap_or(0); + let bb_start_addrs: Vec = + self.blocks.iter().map(|x| x.offset).collect::>(); + match feature_type { FeatureType::Tiknib | FeatureType::Gemini @@ -420,27 +421,20 @@ impl AGFJFunc { | FeatureType::DGIS => { let feature_vecs = feature_vecs.as_f64_mut().unwrap(); for bb in &self.blocks { - bb.get_block_edges( - &mut addr_idxs, - &mut edge_list, - max_offset, - min_offset, - ); + bb.get_block_edges(&bb_start_addrs, &mut edge_list); bb.generate_bb_feature_vec(feature_vecs, feature_type, architecture); } + debug!("Number of Feature Vecs: {}", feature_vecs.len()); + assert_eq!(self.blocks.len(), feature_vecs.len()) } FeatureType::Esil | FeatureType::Disasm | FeatureType::Pseudo => { let feature_vecs = feature_vecs.as_string_mut().unwrap(); for bb in &self.blocks { - bb.get_block_edges( - &mut addr_idxs, - &mut edge_list, - max_offset, - min_offset, - ); + bb.get_block_edges(&bb_start_addrs, &mut edge_list); bb.generate_bb_feature_strings(feature_vecs, feature_type, true); } - debug!("Number of Feature Vecs: {}", feature_vecs.len()) + debug!("Number of Feature Vecs: {}", feature_vecs.len()); + assert_eq!(self.blocks.len(), feature_vecs.len()) } FeatureType::ModelEmbedded | FeatureType::Encoded | FeatureType::Invalid => { info!("Invalid Feature Type. Skipping.."); @@ -454,10 +448,16 @@ impl AGFJFunc { edge_list.is_empty(), edge_list.len() ); - if !edge_list.is_empty() { - let mut graph = Graph::::from_edges(&edge_list); - Self::str_to_hex_node_idxs(&mut graph, &mut addr_idxs); + if !edge_list.is_empty() { + let mut graph = Graph::::from_edges(&edge_list); + Self::str_to_hex_node_idxs(&mut graph, &bb_start_addrs); + if graph.node_count() != self.blocks.len() { + debug!("Graph for {} does not have the same number of nodes as basic blocks - N: {} B: {}. This suggests \ + there is something wrong with the CFG edge recovery. If this is a problem, please raise a GitHub issue!", + self.name, graph.node_count(), self.blocks.len()); + return; + } // Unpack the NodeTypes to the inner Types if feature_type == FeatureType::Gemini { @@ -577,22 +577,24 @@ impl AGFJFunc { info!("Function {} has no edges. Skipping...", self.name) } } else { - info!( + debug!( "Function {} has less than the minimum number of blocks. Skipping..", self.name ); } } else { - info!( - "Function {} has already been processed. Skipping...", - self.name - ) + trace!("Function has fewer basic blocks than the minimum. Skipping..."); } + } else { + debug!( + "Function {} has already been processed. Skipping...", + self.name + ) } } // Convert string memory address to hex / string - fn str_to_hex_node_idxs(graph: &mut Graph, addr_idxs: &mut [i64]) { + fn str_to_hex_node_idxs(graph: &mut Graph, addr_idxs: &[i64]) { for idx in graph.node_indices() { let i_idx = idx.index(); let hex = addr_idxs[i_idx]; diff --git a/src/bb.rs b/src/bb.rs index 39ba083..b8c1031 100644 --- a/src/bb.rs +++ b/src/bb.rs @@ -446,70 +446,21 @@ impl ACFJBlock { } num_offspring } - - // Get the edges associated with a given basic block. - // This function only considers valid edges as being - // fail, jumps or switchops that reside within the function itself. - // If there are edges that jump to another function outside of the program - // these edges are ignored. - // - // This function updates the provide mutable edge list with a three-tuple which - // represents (src, dst, weight). The weight in this case is the type of edge where - // 1 denotes jump, 2 denotes fail, 3 denotes switchop - pub fn get_block_edges( - &self, - addr_idxs: &mut Vec, - edge_list: &mut Vec<(u32, u32, u32)>, - max_offset: u64, - min_offset: u64, - ) { - let mut addr: i64 = self.offset; - let mut jump: i64 = self.jump; - let mut fail: i64 = self.fail; - - if addr < min_offset.try_into().unwrap() || addr >= max_offset.try_into().unwrap() { - addr = -1; - } - - if jump < min_offset.try_into().unwrap() || jump >= max_offset.try_into().unwrap() { - jump = -1; - } - - if fail < min_offset.try_into().unwrap() || fail >= max_offset.try_into().unwrap() { - fail = -1; - } - - if addr != -1 && !addr_idxs.contains(&addr) { - addr_idxs.push(addr); - } - if jump != -1 && !addr_idxs.contains(&jump) { - addr_idxs.push(jump) - } - - if fail != -1 && !addr_idxs.contains(&fail) { - addr_idxs.push(fail) - } - - let addr_idx = addr_idxs.iter().position(|&p| p == addr); - - if let Some(addr_idx) = addr_idx { - if jump != -1 { - let jump_idx = addr_idxs.iter().position(|&p| p == jump).unwrap(); - edge_list.push((addr_idx as u32, jump_idx as u32, 1)); - } - - if fail != -1 { - let fail_idx = addr_idxs.iter().position(|&p| p == fail).unwrap(); - edge_list.push((addr_idx as u32, fail_idx as u32, 2)); + pub fn get_block_edges(&self, bb_start_addrs: &[i64], edge_list: &mut Vec<(u32, u32, u32)>) { + let offset_idx = bb_start_addrs.iter().position(|&p| p == self.offset); + + if let Some(offset_idx) = offset_idx { + if self.jump != -1 { + let jump_idx = bb_start_addrs.iter().position(|&p| p == self.jump); + if let Some(jump_idx) = jump_idx { + edge_list.push((offset_idx as u32, jump_idx as u32, 1)); + } } - if self.switchop.is_some() { - for item in &self.switchop.as_ref().unwrap().cases { - if !addr_idxs.contains(&item.jump) { - addr_idxs.push(item.jump) - } - let item_addr_idx = addr_idxs.iter().position(|&p| p == item.jump).unwrap(); - edge_list.push((addr_idx as u32, item_addr_idx as u32, 3)); + if self.fail != -1 { + let fail_idx = bb_start_addrs.iter().position(|&p| p == self.fail); + if let Some(fail_idx) = fail_idx { + edge_list.push((offset_idx as u32, fail_idx as u32, 1)); } } } From 4d82f45f3abce2fa5f2be334f2aac091c6fe8365 Mon Sep 17 00:00:00 2001 From: br0kej Date: Wed, 11 Sep 2024 22:49:51 +0100 Subject: [PATCH 2/3] bump version to 0.4.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 13b50b6..72f03d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bin2ml" -version = "0.3.2" +version = "0.4.0" edition = "2021" [dependencies] From cbd73cd021f919855540204c24407445091c07c1 Mon Sep 17 00:00:00 2001 From: br0kej Date: Wed, 11 Sep 2024 22:53:18 +0100 Subject: [PATCH 3/3] fixing test given new basic block edge function --- src/agfj.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/agfj.rs b/src/agfj.rs index d4f5a17..c09408a 100644 --- a/src/agfj.rs +++ b/src/agfj.rs @@ -862,18 +862,19 @@ mod tests { // Check edge list output is the correct format let expected_edge_list = Some(vec![ + (0, 2, 1), (0, 1, 1), - (0, 2, 2), - (2, 3, 1), (1, 3, 1), + (2, 3, 1), + (3, 5, 1), (3, 4, 1), - (3, 5, 2), + (4, 8, 1), + (5, 7, 1), (5, 6, 1), - (4, 7, 1), - (4, 8, 2), - (8, 6, 1), - (7, 6, 1), + (6, 8, 1), + (7, 8, 1), ]); + assert_eq!(target_func.edge_list, expected_edge_list) } }