#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Native String Extraction Module
This module extracts strings from native binaries (.so files) using radare2.
The extracted strings are then made available to other analysis modules for
pattern matching, tracker detection, and security analysis.
"""
import json
import re
import time
from typing import Any, Dict, List, Optional

from .base_native_module import BaseNativeModule, NativeBinaryInfo, NativeAnalysisResult, NativeStringSource
class NativeStringExtractionModule(BaseNativeModule):
"""
Native module for extracting strings from native binaries.
This module uses radare2's string extraction capabilities to find readable
strings in native binaries and makes them available for further analysis.
"""
def __init__(self, config: Dict[str, Any], logger: Optional[Any] = None):
    """Initialize the string extraction module.

    Args:
        config: Module configuration dict (length limits, encodings).
        logger: Optional logger instance, forwarded to the base class.
    """
    super().__init__(config, logger)

    # Length bounds and encoding preferences for extracted strings.
    self.min_length = config.get('min_string_length', 4)
    self.max_length = config.get('max_string_length', 1024)
    self.encoding = config.get('encoding', 'utf-8')
    self.fallback_encodings = config.get('fallback_encodings', ['latin1', 'ascii'])

    # Regexes describing "noise" strings; anything matching one of these
    # is dropped during filtering to reduce uninteresting results.
    noise = [
        r'^[0-9.]+$',       # Pure numbers/versions
        r'^[A-Fa-f0-9]+$',  # Hexadecimal strings
        r'^[\x00-\x1f]+$',  # Control characters only
        r'^[^A-Za-z]*$'     # No alphabetic characters
    ]
    self.noise_patterns = noise
    self.compiled_noise_patterns = [re.compile(p) for p in noise]
def analyze_binary(self, binary_info: NativeBinaryInfo, r2: Any) -> NativeAnalysisResult:
    """
    Extract strings from a native binary using radare2.

    Candidates are gathered via two r2 passes (``iz`` over data sections
    and ``izz`` over all sections), deduplicated (first occurrence wins),
    and then filtered against the configured noise patterns.

    Args:
        binary_info: Information about the binary being analyzed
        r2: r2pipe connection to the binary

    Returns:
        NativeAnalysisResult with extracted strings
    """
    started = time.time()
    try:
        self.logger.debug(f"Extracting strings from {binary_info.file_name}")

        # Collect candidates from both extraction passes.
        candidates = []
        candidates.extend(self._extract_strings_iz(r2, binary_info))
        candidates.extend(self._extract_strings_izz(r2, binary_info))

        # Deduplicate first so filtering statistics reflect unique strings.
        deduped = self._deduplicate_strings(candidates)
        kept = self._filter_noise_strings(deduped)

        self.logger.debug(f"Extracted {len(kept)} strings from {binary_info.file_name}")

        stats = {
            'total_strings_before_filtering': len(deduped),
            'strings_after_filtering': len(kept),
            # max(..., 1) guards against division by zero on empty input.
            'filter_ratio': len(kept) / max(len(deduped), 1)
        }
        return NativeAnalysisResult(
            binary_info=binary_info,
            module_name=self.get_module_name(),
            success=True,
            execution_time=time.time() - started,
            strings_found=kept,
            additional_data=stats
        )
    except Exception as e:
        # Per-binary failure boundary: report the error instead of raising
        # so one bad binary does not abort the overall analysis run.
        self.logger.error(f"String extraction failed for {binary_info.file_name}: {e}")
        return NativeAnalysisResult(
            binary_info=binary_info,
            module_name=self.get_module_name(),
            success=False,
            error_message=str(e),
            execution_time=time.time() - started
        )
def _extract_strings_iz(self, r2: Any, binary_info: NativeBinaryInfo) -> List[NativeStringSource]:
    """Extract strings using r2's iz command (data sections only).

    Runs the JSON variant (``izj``) and falls back to parsing the plain
    text output when the JSON cannot be decoded. Extraction is
    best-effort: any failure is logged and an empty/partial list is
    returned instead of raising.

    Args:
        r2: r2pipe connection to the binary.
        binary_info: Metadata of the binary being analyzed.

    Returns:
        List of NativeStringSource entries found in data sections.
    """
    strings: List[NativeStringSource] = []
    try:
        # JSON output is easier and safer to parse than the text table.
        result = self._safe_r2_command(r2, "izj", "[]")
        if result and result != "[]":
            try:
                string_objects = json.loads(result)
                for obj in string_objects:
                    if not isinstance(obj, dict):
                        continue
                    string_content = obj.get('string', '')
                    # Prefer the virtual address; fall back to the physical one.
                    offset = obj.get('vaddr', obj.get('paddr', 0))
                    if self._is_valid_string_length(string_content):
                        strings.append(NativeStringSource(
                            content=string_content,
                            file_path=binary_info.relative_path,
                            extraction_method="r2_iz_data_sections",
                            offset=offset,
                            encoding=self.encoding,
                            confidence=0.9  # High confidence from data sections
                        ))
            except json.JSONDecodeError:
                # Fallback to text parsing
                strings.extend(self._parse_iz_text_output(result, binary_info, "r2_iz_data_sections"))
    except Exception as e:
        # Best-effort extraction: log and return whatever was collected.
        self.logger.debug(f"iz command failed: {e}")
    return strings
def _extract_strings_izz(self, r2: Any, binary_info: NativeBinaryInfo) -> List[NativeStringSource]:
    """Extract strings using r2's izz command (all sections).

    Runs the JSON variant (``izzj``) and falls back to parsing the plain
    text output when the JSON cannot be decoded. Results carry a slightly
    lower confidence than the data-section pass because code sections
    produce more false positives.

    Args:
        r2: r2pipe connection to the binary.
        binary_info: Metadata of the binary being analyzed.

    Returns:
        List of NativeStringSource entries found across all sections.
    """
    strings: List[NativeStringSource] = []
    try:
        # JSON output is easier and safer to parse than the text table.
        result = self._safe_r2_command(r2, "izzj", "[]")
        if result and result != "[]":
            try:
                string_objects = json.loads(result)
                for obj in string_objects:
                    if not isinstance(obj, dict):
                        continue
                    string_content = obj.get('string', '')
                    # Prefer the virtual address; fall back to the physical one.
                    offset = obj.get('vaddr', obj.get('paddr', 0))
                    section = obj.get('section', 'unknown')
                    if self._is_valid_string_length(string_content):
                        strings.append(NativeStringSource(
                            content=string_content,
                            file_path=binary_info.relative_path,
                            extraction_method=f"r2_izz_all_sections_{section}",
                            offset=offset,
                            encoding=self.encoding,
                            confidence=0.8  # Slightly lower confidence from all sections
                        ))
            except json.JSONDecodeError:
                # Fallback to text parsing
                strings.extend(self._parse_iz_text_output(result, binary_info, "r2_izz_all_sections"))
    except Exception as e:
        # Best-effort extraction: log and return whatever was collected.
        self.logger.debug(f"izz command failed: {e}")
    return strings
def _parse_iz_text_output(self, output: str, binary_info: NativeBinaryInfo, method: str) -> List[NativeStringSource]:
    """Parse text output from iz/izz commands as fallback.

    Expected row format:
    ``[ordinal] [offset] [length] [size] [section] string_content``

    Rows that do not fit this shape, or whose offset is not numeric,
    are silently skipped.
    """
    results = []
    if not output or not output.strip():
        return results

    for raw_line in output.strip().split('\n'):
        if not raw_line.strip():
            continue
        # Split into at most 6 fields so the string itself keeps any
        # embedded whitespace intact.
        fields = raw_line.split(None, 5)
        if len(fields) < 6:
            continue
        offset_text = fields[1]
        text = fields[5]
        try:
            # Offsets are usually hex ("0x..."), occasionally decimal.
            parsed_offset = int(offset_text, 16) if offset_text.startswith('0x') else int(offset_text)
        except ValueError:
            continue
        if self._is_valid_string_length(text):
            results.append(NativeStringSource(
                content=text,
                file_path=binary_info.relative_path,
                extraction_method=method,
                offset=parsed_offset,
                encoding=self.encoding,
                confidence=0.7  # Lower confidence for text parsing
            ))
    return results
def _is_valid_string_length(self, string_content: str) -> bool:
    """Return True when the string is non-empty and its length lies
    within the configured [min_length, max_length] bounds."""
    if not string_content:
        return False
    length = len(string_content)
    return length >= self.min_length and length <= self.max_length
def _deduplicate_strings(self, strings: List[NativeStringSource]) -> List[NativeStringSource]:
    """Drop entries with repeated string content, keeping the first
    occurrence of each; input order is otherwise preserved."""
    seen = set()
    result = []
    for entry in strings:
        content = entry.content
        if content in seen:
            continue
        seen.add(content)
        result.append(entry)
    return result
def _filter_noise_strings(self, strings: List[NativeStringSource]) -> List[NativeStringSource]:
    """Filter out noise strings that are unlikely to be useful.

    An entry is dropped when its content is empty or shorter than
    ``min_length`` after stripping whitespace, or when it matches one
    of the pre-compiled noise regexes.
    """
    kept = []
    for entry in strings:
        text = entry.content
        # Skip empty or very short strings
        if not text or len(text.strip()) < self.min_length:
            continue
        if any(p.match(text) for p in self.compiled_noise_patterns):
            self.logger.debug(f"Filtered noise string: {text[:50]}...")
        else:
            kept.append(entry)
    return kept
def can_analyze(self, binary_info: NativeBinaryInfo) -> bool:
    """Check if this module should analyze the given binary."""
    # Base-class checks first, then restrict to non-empty .so files.
    if not super().can_analyze(binary_info):
        return False
    if not binary_info.file_name.endswith('.so'):
        return False
    return binary_info.file_size > 0
def get_module_name(self) -> str:
    """Return the unique identifier of this native analysis module."""
    return "native_string_extraction"