"""Reusable Shannon entropy computation for TLS memory dump analysis.
Provides sliding-window entropy profiling with O(1) incremental updates
per step. Used by change_point detection and entropy visualization.
All functions are stdlib-only (math) with no external dependencies.
"""
import logging
import math
from typing import List, Tuple
logger = logging.getLogger("memdiver.entropy")
[docs]
def entropy_from_freq(freq: list, total: int) -> float:
"""Compute Shannon entropy from byte frequency counts.
Iterates over 256 frequency bins and applies the standard formula:
H = -sum(p * log2(p)) for each p = count / total where count > 0
Args:
freq: List of 256 integer counts (one per byte value).
total: Sum of all counts (window size).
Returns:
Entropy in bits per byte, in range [0.0, 8.0].
Returns 0.0 when total is zero.
"""
if total == 0:
return 0.0
entropy = 0.0
for count in freq:
if count > 0:
p = count / total
entropy -= p * math.log2(p)
return entropy
[docs]
def shannon_entropy(data: bytes) -> float:
"""Compute Shannon entropy of raw byte data.
Builds a full frequency table over the input and computes entropy
in a single pass.
Args:
data: Arbitrary byte sequence.
Returns:
Entropy in bits per byte, in range [0.0, 8.0].
Returns 0.0 for empty input.
"""
length = len(data)
if length == 0:
return 0.0
freq = [0] * 256
for byte in data:
freq[byte] += 1
return entropy_from_freq(freq, length)
[docs]
def compute_entropy_profile(
data: bytes, window: int = 32, step: int = 1
) -> List[Tuple[int, float]]:
"""Sliding-window entropy profile over byte data.
Uses an incremental frequency table that adds the incoming byte and
removes the outgoing byte at each step, keeping each step O(1)
regardless of window size.
For large dumps (e.g. 10 MB), use step=16 to produce ~625K sample
points instead of ~10M.
Args:
data: Raw memory dump bytes.
window: Sliding window size in bytes (default 32).
step: Advance step in bytes (default 1).
Returns:
List of (offset, entropy) tuples, one per window position.
Returns an empty list when data is shorter than the window.
"""
data_len = len(data)
if data_len < window:
return []
# Initialize frequency table for the first window.
freq = [0] * 256
for i in range(window):
freq[data[i]] += 1
profile: List[Tuple[int, float]] = []
profile.append((0, entropy_from_freq(freq, window)))
# Slide the window forward by 'step' bytes at a time.
pos = step
while pos <= data_len - window:
# Incrementally update: remove bytes that left, add bytes that entered.
old_start = pos - step
new_end_start = pos + window - step
for i in range(old_start, min(old_start + step, data_len)):
freq[data[i]] -= 1
for i in range(new_end_start, min(new_end_start + step, data_len)):
freq[data[i]] += 1
profile.append((pos, entropy_from_freq(freq, window)))
pos += step
return profile
[docs]
def find_high_entropy_regions(
profile: List[Tuple[int, float]],
threshold: float = 7.5,
min_width: int = 32,
) -> List[Tuple[int, int, float]]:
"""Find contiguous high-entropy regions in an entropy profile.
Scans the profile for runs of consecutive points at or above the
threshold, then filters by minimum width.
Args:
profile: Output of compute_entropy_profile -- list of
(offset, entropy) tuples, assumed sorted by offset.
threshold: Minimum entropy (bits/byte) to qualify as
high-entropy. Default 7.5 targets near-random data.
min_width: Minimum span (end - start) in bytes for a region
to be reported. Default 32 (one AES-256 key length).
Returns:
List of (start_offset, end_offset, mean_entropy) tuples for
each qualifying region. Offsets refer to the window start
positions from the profile.
"""
if not profile:
return []
regions: List[Tuple[int, int, float]] = []
in_region = False
region_start = 0
entropy_sum = 0.0
region_count = 0
for offset, entropy in profile:
if entropy >= threshold:
if not in_region:
in_region = True
region_start = offset
entropy_sum = 0.0
region_count = 0
entropy_sum += entropy
region_count += 1
else:
if in_region:
region_end = offset
if region_end - region_start >= min_width and region_count > 0:
mean_entropy = entropy_sum / region_count
regions.append((region_start, region_end, mean_entropy))
in_region = False
# Close any region that extends to the end of the profile.
if in_region and region_count > 0:
last_offset = profile[-1][0]
region_end = last_offset
if region_end - region_start >= min_width:
mean_entropy = entropy_sum / region_count
regions.append((region_start, region_end, mean_entropy))
return regions