Source code for dexray_insight.modules.native.string_extraction

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Native String Extraction Module

This module extracts strings from native binaries (.so files) using radare2.
The extracted strings are then made available to other analysis modules for
pattern matching, tracker detection, and security analysis.
"""

import time
import re
from typing import Dict, Any, Optional, List

from .base_native_module import BaseNativeModule, NativeBinaryInfo, NativeAnalysisResult, NativeStringSource

class NativeStringExtractionModule(BaseNativeModule):
    """
    Native module for extracting strings from native binaries.

    This module uses radare2's string extraction capabilities to find readable
    strings in native binaries and makes them available for further analysis.
    """
    def __init__(self, config: Dict[str, Any], logger: Optional[Any] = None):
        super().__init__(config, logger)

        # Configuration
        self.min_length = config.get('min_string_length', 4)
        self.max_length = config.get('max_string_length', 1024)
        self.encoding = config.get('encoding', 'utf-8')
        self.fallback_encodings = config.get('fallback_encodings', ['latin1', 'ascii'])

        # String filtering patterns (to reduce noise)
        self.noise_patterns = [
            r'^[0-9.]+$',        # Pure numbers/versions
            r'^[A-Fa-f0-9]+$',   # Hexadecimal strings
            r'^[\x00-\x1f]+$',   # Control characters only
            r'^[^A-Za-z]*$'      # No alphabetic characters
        ]
        self.compiled_noise_patterns = [re.compile(pattern) for pattern in self.noise_patterns]
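    # Example configuration (illustrative values only; the keys correspond to
    # the config.get() lookups in __init__ above):
    #   {
    #       'min_string_length': 6,
    #       'max_string_length': 512,
    #       'encoding': 'utf-8',
    #       'fallback_encodings': ['latin1', 'ascii'],
    #   }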
    def analyze_binary(self, binary_info: NativeBinaryInfo, r2: Any) -> NativeAnalysisResult:
        """
        Extract strings from a native binary using radare2.

        Args:
            binary_info: Information about the binary being analyzed
            r2: r2pipe connection to the binary

        Returns:
            NativeAnalysisResult with extracted strings
        """
        start_time = time.time()

        try:
            self.logger.debug(f"Extracting strings from {binary_info.file_name}")

            # Extract strings using multiple r2 commands
            strings_found = []

            # Method 1: Use r2's iz command (strings in data sections)
            strings_found.extend(self._extract_strings_iz(r2, binary_info))

            # Method 2: Use r2's izz command (strings in all sections)
            strings_found.extend(self._extract_strings_izz(r2, binary_info))

            # Remove duplicates while preserving order
            unique_strings = self._deduplicate_strings(strings_found)

            # Filter noise
            filtered_strings = self._filter_noise_strings(unique_strings)

            self.logger.debug(f"Extracted {len(filtered_strings)} strings from {binary_info.file_name}")

            return NativeAnalysisResult(
                binary_info=binary_info,
                module_name=self.get_module_name(),
                success=True,
                execution_time=time.time() - start_time,
                strings_found=filtered_strings,
                additional_data={
                    'total_strings_before_filtering': len(unique_strings),
                    'strings_after_filtering': len(filtered_strings),
                    'filter_ratio': len(filtered_strings) / max(len(unique_strings), 1)
                }
            )

        except Exception as e:
            self.logger.error(f"String extraction failed for {binary_info.file_name}: {e}")
            return NativeAnalysisResult(
                binary_info=binary_info,
                module_name=self.get_module_name(),
                success=False,
                error_message=str(e),
                execution_time=time.time() - start_time
            )
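    # The extraction helpers below rely on radare2's JSON string listings
    # ("izj" for data sections, "izzj" for the whole binary). Each entry is
    # expected to carry the fields read here ('string', 'vaddr', 'paddr',
    # 'length', 'section'), roughly like the following illustrative record
    # (field names and layout can vary between radare2 versions):
    #   {"vaddr": 4096, "paddr": 4096, "length": 11, "size": 12,
    #    "section": ".rodata", "type": "ascii", "string": "libfoo_init"}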
    def _extract_strings_iz(self, r2: Any, binary_info: NativeBinaryInfo) -> List[NativeStringSource]:
        """Extract strings using r2's iz command (data sections only)"""
        strings = []

        try:
            # Get strings from data sections
            result = self._safe_r2_command(r2, "izj", "[]")  # JSON format for easier parsing

            if result and result != "[]":
                import json
                try:
                    string_objects = json.loads(result)
                    for obj in string_objects:
                        if isinstance(obj, dict):
                            string_content = obj.get('string', '')
                            offset = obj.get('vaddr', obj.get('paddr', 0))
                            # length = obj.get('length', len(string_content))  # Unused variable

                            if self._is_valid_string_length(string_content):
                                strings.append(NativeStringSource(
                                    content=string_content,
                                    file_path=binary_info.relative_path,
                                    extraction_method="r2_iz_data_sections",
                                    offset=offset,
                                    encoding=self.encoding,
                                    confidence=0.9  # High confidence from data sections
                                ))
                except json.JSONDecodeError:
                    # Fallback to text parsing
                    strings.extend(self._parse_iz_text_output(result, binary_info, "r2_iz_data_sections"))

        except Exception as e:
            self.logger.debug(f"iz command failed: {e}")

        return strings

    def _extract_strings_izz(self, r2: Any, binary_info: NativeBinaryInfo) -> List[NativeStringSource]:
        """Extract strings using r2's izz command (all sections)"""
        strings = []

        try:
            # Get strings from all sections
            result = self._safe_r2_command(r2, "izzj", "[]")  # JSON format

            if result and result != "[]":
                import json
                try:
                    string_objects = json.loads(result)
                    for obj in string_objects:
                        if isinstance(obj, dict):
                            string_content = obj.get('string', '')
                            offset = obj.get('vaddr', obj.get('paddr', 0))
                            section = obj.get('section', 'unknown')

                            if self._is_valid_string_length(string_content):
                                strings.append(NativeStringSource(
                                    content=string_content,
                                    file_path=binary_info.relative_path,
                                    extraction_method=f"r2_izz_all_sections_{section}",
                                    offset=offset,
                                    encoding=self.encoding,
                                    confidence=0.8  # Slightly lower confidence from all sections
                                ))
                except json.JSONDecodeError:
                    # Fallback to text parsing
                    strings.extend(self._parse_iz_text_output(result, binary_info, "r2_izz_all_sections"))

        except Exception as e:
            self.logger.debug(f"izz command failed: {e}")

        return strings

    def _parse_iz_text_output(self, output: str, binary_info: NativeBinaryInfo, method: str) -> List[NativeStringSource]:
        """Parse text output from iz/izz commands as fallback"""
        strings = []

        if not output or not output.strip():
            return strings

        lines = output.strip().split('\n')

        for line in lines:
            if not line.strip():
                continue

            # Parse r2 string output format
            # Format typically: [ordinal] [offset] [length] [size] [section] string_content
            parts = line.split(None, 5)
            if len(parts) >= 6:
                try:
                    offset_str = parts[1]
                    string_content = parts[5]

                    # Parse offset
                    offset = int(offset_str, 16) if offset_str.startswith('0x') else int(offset_str)

                    if self._is_valid_string_length(string_content):
                        strings.append(NativeStringSource(
                            content=string_content,
                            file_path=binary_info.relative_path,
                            extraction_method=method,
                            offset=offset,
                            encoding=self.encoding,
                            confidence=0.7  # Lower confidence for text parsing
                        ))

                except (ValueError, IndexError):
                    continue

        return strings

    def _is_valid_string_length(self, string_content: str) -> bool:
        """Check if string meets length requirements"""
        if not string_content:
            return False
        return self.min_length <= len(string_content) <= self.max_length

    def _deduplicate_strings(self, strings: List[NativeStringSource]) -> List[NativeStringSource]:
        """Remove duplicate strings while preserving order"""
        seen_contents = set()
        unique_strings = []

        for string_obj in strings:
            if string_obj.content not in seen_contents:
                seen_contents.add(string_obj.content)
                unique_strings.append(string_obj)

        return unique_strings

    def _filter_noise_strings(self, strings: List[NativeStringSource]) -> List[NativeStringSource]:
        """Filter out noise strings that are unlikely to be useful"""
        filtered = []

        for string_obj in strings:
            content = string_obj.content

            # Skip empty or very short strings
            if not content or len(content.strip()) < self.min_length:
                continue

            # Check noise patterns
            is_noise = False
            for pattern in self.compiled_noise_patterns:
                if pattern.match(content):
                    is_noise = True
                    break

            if not is_noise:
                filtered.append(string_obj)
            else:
                self.logger.debug(f"Filtered noise string: {content[:50]}...")

        return filtered
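    # Illustrative effect of the default noise patterns applied by
    # _filter_noise_strings(): version-like strings such as "1.2.3", hex-only
    # blobs such as "deadbeef1337", and strings with no alphabetic character
    # (e.g. "+-+-+-") are dropped, while strings like "libcrypto.so" or
    # "https://example.com" are kept.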
    def can_analyze(self, binary_info: NativeBinaryInfo) -> bool:
        """Check if this module should analyze the given binary"""
        # Only analyze .so files
        return (super().can_analyze(binary_info) and
                binary_info.file_name.endswith('.so') and
                binary_info.file_size > 0)
    def get_module_name(self) -> str:
        """Get the module name"""
        return "native_string_extraction"
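
A minimal usage sketch follows; it is not part of the module source. It assumes r2pipe is installed alongside radare2, and the NativeBinaryInfo construction is hypothetical: only the fields this module reads (file_name, relative_path, file_size) are shown, and the real dataclass in base_native_module may require additional arguments.

import logging

import r2pipe

from dexray_insight.modules.native.base_native_module import NativeBinaryInfo
from dexray_insight.modules.native.string_extraction import NativeStringExtractionModule

# Hypothetical path to a native library extracted from an APK
lib_path = "unpacked/lib/arm64-v8a/libnative-lib.so"

# Hypothetical NativeBinaryInfo construction; field names mirror the
# attributes the module accesses
binary_info = NativeBinaryInfo(
    file_name="libnative-lib.so",
    relative_path="lib/arm64-v8a/libnative-lib.so",
    file_size=123456,
)

module = NativeStringExtractionModule(
    {"min_string_length": 5},
    logger=logging.getLogger("native_string_extraction"),
)

if module.can_analyze(binary_info):
    r2 = r2pipe.open(lib_path)          # open the binary with radare2
    result = module.analyze_binary(binary_info, r2)
    r2.quit()
    if result.success:
        for s in result.strings_found[:10]:
            print(hex(s.offset), s.content)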