Source code for core.entropy

"""Reusable Shannon entropy computation for TLS memory dump analysis.

Provides sliding-window entropy profiling with O(1) incremental updates
per step. Used by change_point detection and entropy visualization.

All functions are stdlib-only (math) with no external dependencies.
"""

import logging
import math
from typing import List, Tuple

logger = logging.getLogger("memdiver.entropy")


[docs] def entropy_from_freq(freq: list, total: int) -> float: """Compute Shannon entropy from byte frequency counts. Iterates over 256 frequency bins and applies the standard formula: H = -sum(p * log2(p)) for each p = count / total where count > 0 Args: freq: List of 256 integer counts (one per byte value). total: Sum of all counts (window size). Returns: Entropy in bits per byte, in range [0.0, 8.0]. Returns 0.0 when total is zero. """ if total == 0: return 0.0 entropy = 0.0 for count in freq: if count > 0: p = count / total entropy -= p * math.log2(p) return entropy
[docs] def shannon_entropy(data: bytes) -> float: """Compute Shannon entropy of raw byte data. Builds a full frequency table over the input and computes entropy in a single pass. Args: data: Arbitrary byte sequence. Returns: Entropy in bits per byte, in range [0.0, 8.0]. Returns 0.0 for empty input. """ length = len(data) if length == 0: return 0.0 freq = [0] * 256 for byte in data: freq[byte] += 1 return entropy_from_freq(freq, length)
[docs] def compute_entropy_profile( data: bytes, window: int = 32, step: int = 1 ) -> List[Tuple[int, float]]: """Sliding-window entropy profile over byte data. Uses an incremental frequency table that adds the incoming byte and removes the outgoing byte at each step, keeping each step O(1) regardless of window size. For large dumps (e.g. 10 MB), use step=16 to produce ~625K sample points instead of ~10M. Args: data: Raw memory dump bytes. window: Sliding window size in bytes (default 32). step: Advance step in bytes (default 1). Returns: List of (offset, entropy) tuples, one per window position. Returns an empty list when data is shorter than the window. """ data_len = len(data) if data_len < window: return [] # Initialize frequency table for the first window. freq = [0] * 256 for i in range(window): freq[data[i]] += 1 profile: List[Tuple[int, float]] = [] profile.append((0, entropy_from_freq(freq, window))) # Slide the window forward by 'step' bytes at a time. pos = step while pos <= data_len - window: # Incrementally update: remove bytes that left, add bytes that entered. old_start = pos - step new_end_start = pos + window - step for i in range(old_start, min(old_start + step, data_len)): freq[data[i]] -= 1 for i in range(new_end_start, min(new_end_start + step, data_len)): freq[data[i]] += 1 profile.append((pos, entropy_from_freq(freq, window))) pos += step return profile
[docs] def find_high_entropy_regions( profile: List[Tuple[int, float]], threshold: float = 7.5, min_width: int = 32, ) -> List[Tuple[int, int, float]]: """Find contiguous high-entropy regions in an entropy profile. Scans the profile for runs of consecutive points at or above the threshold, then filters by minimum width. Args: profile: Output of compute_entropy_profile -- list of (offset, entropy) tuples, assumed sorted by offset. threshold: Minimum entropy (bits/byte) to qualify as high-entropy. Default 7.5 targets near-random data. min_width: Minimum span (end - start) in bytes for a region to be reported. Default 32 (one AES-256 key length). Returns: List of (start_offset, end_offset, mean_entropy) tuples for each qualifying region. Offsets refer to the window start positions from the profile. """ if not profile: return [] regions: List[Tuple[int, int, float]] = [] in_region = False region_start = 0 entropy_sum = 0.0 region_count = 0 for offset, entropy in profile: if entropy >= threshold: if not in_region: in_region = True region_start = offset entropy_sum = 0.0 region_count = 0 entropy_sum += entropy region_count += 1 else: if in_region: region_end = offset if region_end - region_start >= min_width and region_count > 0: mean_entropy = entropy_sum / region_count regions.append((region_start, region_end, mean_entropy)) in_region = False # Close any region that extends to the end of the profile. if in_region and region_count > 0: last_offset = profile[-1][0] region_end = last_offset if region_end - region_start >= min_width: mean_entropy = entropy_sum / region_count regions.append((region_start, region_end, mean_entropy)) return regions