Source code for smooth_disorder.barcode

"""
barcode.py — H1 Persistent Homology Barcodes and Bond-Network Entropy
======================================================================

This module implements the core algorithm for computing the H1 persistent
homology barcode of a local atomic environment and the Bond-Network Entropy
(BNE) derived from it.

**Background**

Amorphous solids do not have long-range order and instead we can look
at the local atomic environments to understand their structure.
In here we will treat a local atomic environment as a selection of atoms
around a given atom and construct a subgraph where atoms are nodes and bonds
between them are edges.

A "barcode" in persistent homology counts topological loops (rings) in that
subgraph at different distance scales. Each atom has a corresponding barcode; the
distribution of barcodes across all atoms is summarized with a single number —
the information entropy of barcode probability distribution 
— called the Bond-Network Entropy (BNE).

**References**

B. Schweinhart, D. Rodney, and J.K. Mason,
*Phys. Rev. E* 101, 052312 (2020)  — definition of H1 barcode used here.

K. Iwanowski, G. Csányi, and M. Simoncelli,
*Phys. Rev. X* 15, 041041 (2025)   — BNE applied to disordered energy materials.
"""

import string, re, struct, sys, math, os
import scipy
import numpy as np
from ase.io import read
from typing import Tuple, List


# ---------------------------------------------------------------------------
# Local environment extraction
# ---------------------------------------------------------------------------


[docs]
def obtain_local_number_environment_big_structures(
    adjacency_matrix: np.ndarray,
    atom_index: int,
    distances: np.ndarray,
    idx_distances: np.ndarray,
    n_environment_atoms: int,
):
    """
    Extract the local atomic environment (LAE) centred on a given atom.

    The LAE is the subgraph formed by the n_environment_atoms atoms that are
    geometrically closest to the central atom (including central atom).  
    We also record which "layer" (bond-distance shell) each atom belongs to, 
    which is needed for the barcode calculation.

    **How the layer discovery works**

    Layers are found using a breadth-first search (BFS) implemented via
    matrix multiplication:

        current_layer = A @ previous_layer_indicator

    where A is the local adjacency matrix and the indicator vector has 1 at
    atoms already found and 0 elsewhere.  Multiplying spreads the "signal"
    to all bonded neighbours in one step.  By taking the XOR of the new
    indicator with the old one we find only the *newly discovered* atoms,
    i.e. the next shell.

    Parameters
    ----------
    adjacency_matrix : np.ndarray, shape (N, K)
        Global adjacency matrix for the full structure.  Entry [i, j] = 1
        means atoms i and idx_distances[i, j] are bonded, 0 otherwise.
    atom_index : int
        Index of the central atom in the global structure.
    distances : np.ndarray, shape (N, K)
        Pre-computed distances from each atom to itself and its K-1 nearest
        neighbours (K in total; output of obtain_distances_ase or
        obtain_distances_big_structures).
    idx_distances : np.ndarray, shape (N, K)
        Global atom indices corresponding to the distances array.
    n_environment_atoms : int
        Number of atoms to include in the local environment (n in "LAE of n").

    Returns
    -------
    local_adjacency_matrix : np.ndarray, shape (n_environment_atoms, n_environment_atoms)
        Adjacency matrix restricted to the local environment.
    layers : list of lists
        layers[k] contains the local indices of atoms first reached in hop k.
        layers[0] = [local_atom_index] (the central atom itself).
    local_atom_index : int
        Index of the central atom within the local environment (0 … n-1).
    global_index : np.ndarray, shape (n_environment_atoms,)
        Global atom indices of the atoms in the local environment, sorted.
    """

    # -----------------------------------------------------------------------
    # Step 1: identify which n atoms form the local environment
    # -----------------------------------------------------------------------

    # argpartition gives the n_environment_atoms smallest distances in O(n)
    # (faster than a full sort which would be O(n log n))
    local_index = np.argpartition(distances[atom_index], n_environment_atoms)[:n_environment_atoms]

    # convert local position indices back to global atom indices
    global_index = idx_distances[atom_index][local_index]

    # sort global_index so the local adjacency matrix rows/cols have a
    # consistent ordering
    sorted_global_index_idx = np.argsort(global_index)
    global_index = global_index[sorted_global_index_idx]

    # -----------------------------------------------------------------------
    # Step 2: build the local adjacency matrix
    # -----------------------------------------------------------------------
    # For each atom in the local environment, take the row of the global
    # adjacency matrix and restrict it to the other atoms in the environment.

    local_adjacency_matrix = []
    for idx in global_index:
        # which of this atom's neighbours are also in the local environment?
        mask = np.isin(idx_distances[idx], global_index)

        # adjacency entries for the neighbours that are in the environment
        row_adjacency_matrix = adjacency_matrix[idx][mask]
        
        row_connection_order = list(idx_distances[idx][mask])

        # pad to length n_environment_atoms in case fewer neighbours exist
        pad_amount = n_environment_atoms - len(row_connection_order)
        row_adjacency_matrix = np.pad(row_adjacency_matrix, (0, pad_amount), constant_values=(0, 0))
        row_connection_order = np.array(
            row_connection_order + list(global_index[np.logical_not(np.isin(global_index, row_connection_order))]))

        # reorder columns to match the sorted global_index ordering
        local_adjacency_matrix.append(row_adjacency_matrix[np.argsort(row_connection_order)])

    local_adjacency_matrix = np.array(local_adjacency_matrix)

    # -----------------------------------------------------------------------
    # Step 3: BFS to find which bond-distance layer each atom belongs to
    # -----------------------------------------------------------------------

    # find the row index of the central atom within the local environment
    local_atom_index = np.arange(0, n_environment_atoms, 1)[global_index == atom_index][0]

    # indicator vector: 1 at the central atom, 0 everywhere else
    atom_int_array = np.zeros(n_environment_atoms)
    atom_int_array[local_atom_index] = 1

    # boolean mask of all atoms found so far (starts with just the centre)
    local_environment = np.zeros(n_environment_atoms).astype(bool)
    local_environment[local_atom_index] = True

    # layer 0 is the central atom itself
    layers = [[local_atom_index]]

    search_bool = True
    while search_bool:
        # matrix–vector multiply propagates the "signal" to all bonded atoms
        atom_int_array = local_adjacency_matrix @ atom_int_array

        # XOR: atoms that are newly reached (in current shell but not before)
        current_environment = local_environment.copy()
        local_environment = local_environment | atom_int_array.astype(bool)
        current_layer = local_environment ^ current_environment

        if np.isclose(current_layer.sum(), 0.0):  # no new atoms → BFS done
            search_bool = False
            break

        layers.append(list(np.arange(0, n_environment_atoms, 1)[current_layer]))

    return local_adjacency_matrix, layers, local_atom_index, global_index



# ---------------------------------------------------------------------------
# Möbius inversion cache
# ---------------------------------------------------------------------------

# mu stores previously computed values of the Möbius inversion function.
# This avoids recomputing the same values repeatedly (memoization).
# The key is a string encoding the four integer arguments: "(a,b),(c,d)".
mu = {}



[docs]
def clear_mu_cache():
    """
    Clear the Möbius inversion memoization cache.

    The mu dictionary accumulates values across calls to recursive_find_mu.
    Call this function in test teardown to prevent state leaking between tests.
    In normal usage (single script) it never needs to be called.
    """
    mu.clear()



# ---------------------------------------------------------------------------
# Möbius inversion function
# ---------------------------------------------------------------------------


[docs]
def recursive_find_mu(mu: dict, a: int, b: int, c: int, d: int):
    """
    Recursively compute the Möbius inversion function μ((a,b),(c,d)).

    **Background**

    The Möbius inversion is a mathematical tool from combinatorics that
    "inverts" a cumulative sum over a partially ordered set.  Here the
    partial order is defined on pairs of non-negative integers (i, j) with
    i ≤ j, ordered by (a,b) ≤ (c,d)  iff  c ≤ a  and  d ≥ b.

    In the barcode calculation, F[a,b] counts rings in a *union* of shells
    a through b.  The Möbius inversion is used to extract G[c,d] — rings
    that first appear in the interval [c,d] specifically — from the
    cumulative F values.

    Reference: B. Schweinhart et al., Phys. Rev. E 101, 052312 (2020).

    **Memoization**

    Results are stored in the `mu` dictionary so each (a,b,c,d) combination
    is computed only once, then reused on later calls.

    Parameters
    ----------
    mu : dict
        Lookup table passed by reference; modified in-place.
    a, b, c, d : int
        Integer indices satisfying c ≤ a, a ≤ b, b ≤ d.

    Returns
    -------
    int
        The value of the Möbius function μ((a,b),(c,d)).
    """
    # sanity check: the arguments must satisfy the ordering constraints
    if a < c or b > d or b < a or d < c:
        raise Exception("invalid subshell")

    element = f"({a},{b}),({c},{d})"

    if element in mu:
        # result already known — return immediately
        return mu[element]

    else:
        if a == c and b == d:
            # base case: μ((a,b),(a,b)) = 1 by definition
            mu[element] = 1
            return mu[element]

        else:
            # recursive case: sum over all strictly smaller (e,f) intervals
            result = 0

            for e in range(c, a + 1):
                for f in range(b, d + 1):
                    if e == c and f == d:
                        continue  # skip (c,d) itself to avoid circular reference

                    new_element = f"({a},{b}),({e},{f})"
                    if new_element in mu:
                        result += -mu[new_element]
                    else:
                        result += -recursive_find_mu(mu, a, b, e, f)

            
            mu[element] = result
            return mu[element]



# ---------------------------------------------------------------------------
# H1 barcode computation
# ---------------------------------------------------------------------------


[docs]
def obtain_H1_barcode(adjacency_matrix: np.ndarray, layers: List[List[int]], mu: dict):
    """
    Compute the H1 persistent homology barcode for a local atomic environment.

    **What is an H1 barcode?**

    H1 (first homology) counts closed loops (rings) in a graph.  A "barcode"
    in persistent homology is a record of which loops exist at different
    distance scales.  Here we count loops present in each pair of shells
    (i, j), where shell i contains all atoms reachable in exactly i bond hops
    from the centre.

    **Algorithm overview**

    1. Compute F[i,j] = number of *algebraically independent* rings in the union
       of shells i through j, using the Euler characteristic formula
       ``rank = (components) - (atoms) + (edges)``.
       The rank of the first homology equals the number of independent loops.

    2. Apply the Möbius inversion to go from cumulative F to local G:
       ``G[c,d] = Σ_{(a,b) ≤ (c, d)} F[a,b] * μ((a,b),(c,d))``.
       G[c,d] counts rings that *start* in shell c and *end* in shell d
       (i.e. rings that first close when shell d is reached).

    Parameters
    ----------
    adjacency_matrix : np.ndarray, shape (n, n)
        Local adjacency matrix of the environment (from
        obtain_local_number_environment_big_structures).
    layers : list of lists
        Layer assignment from the BFS (output of same function).
    mu : dict
        Möbius function cache (pass the module-level `mu` dict).

    Returns
    -------
    G : np.ndarray, shape (n_layers, n_layers)
        H1 barcode matrix.  G[c,d] = number of independent rings that first
        appear between shells c and d.
    F : np.ndarray, shape (n_layers, n_layers)
        Cumulative ring count matrix.  F[i,j] = total independent rings in
        shells i through j.
    """
    n_steps = len(layers)  # number of shells (including the central atom as shell 0)

    # -----------------------------------------------------------------------
    # Step 1: compute F[i,j] for all shell pairs using the Euler characteristic
    # -----------------------------------------------------------------------
    F = np.zeros((n_steps, n_steps))

    for i in range(0, n_steps):
        for j in range(i, n_steps):

            # collect all atom indices in shells i through j (inclusive)
            shell_layer_atoms = []
            for layer in layers[i:j + 1]:
                shell_layer_atoms += layer

            n_shell_atoms = len(shell_layer_atoms)

            # restrict the adjacency matrix to only these atoms
            shell_adjacency_matrix = adjacency_matrix[shell_layer_atoms, :][:, shell_layer_atoms]

            # count bonds (edges) — each bond is counted twice in the matrix,
            # so divide by 2
            n_shell_edges = int(shell_adjacency_matrix.sum() / 2)

            # count connected components using SciPy's sparse graph routine
            n_shell_components = scipy.sparse.csgraph.connected_components(shell_adjacency_matrix)[0]

            # Euler characteristic formula for the first Betti number (H1 rank):
            #   β₁ = edges - atoms + connected_components
            rank_shell = n_shell_components - n_shell_atoms + n_shell_edges
            F[i, j] = rank_shell

    # -----------------------------------------------------------------------
    # Step 2: apply Möbius inversion to get the local barcode G from F
    # -----------------------------------------------------------------------
    G = np.zeros((n_steps, n_steps))

    for c in range(0, n_steps):
        for d in range(c, n_steps):
            result = 0
            for a in range(c, d + 1):
                for b in range(a, d + 1):
                    element = f"({a},{b}),({c},{d})"
                    if element in mu:
                        result += F[a, b] * mu[element]
                    else:
                        result += F[a, b] * recursive_find_mu(mu, a, b, c, d)
            G[c, d] = result

    # G is the H1 barcode; F is the cumulative ring-count matrix
    return G, F



# ---------------------------------------------------------------------------
# Barcode reduction (canonical form)
# ---------------------------------------------------------------------------


[docs]
def reduce_barcode(G_matrix):
    """
    Remove trailing all-zero rows and columns from a barcode matrix.

    **Why this is needed**

    Two barcodes should be considered identical if they encode the same
    topological information.  A zero-padded matrix and its trimmed version
    describe the same rings, but would compare as different arrays.  Reducing
    to a canonical (minimal) form allows us to count how many atoms share the
    same barcode type.

    The function works recursively: if the last column sums to zero, it (and
    the matching last row) carry no ring information and are removed.  We keep
    trimming until either a 1x1 matrix remains or the last column is non-zero.

    Parameters
    ----------
    G_matrix : np.ndarray, shape (m, m)
        Barcode matrix to reduce.

    Returns
    -------
    np.ndarray
        Reduced barcode matrix with trailing zero rows/columns removed.
    """
    if G_matrix.shape == (1, 1):
        # base case: cannot reduce further
        reduced_G_matrix = G_matrix
    elif np.isclose(G_matrix[:, -1].sum(), 0.0):
        # last column is all zeros — remove it (and the matching last row)
        reduced_G_matrix = G_matrix[:-1, :-1]
        reduced_G_matrix = reduce_barcode(reduced_G_matrix)
    else:
        # last column has non-zero entries — stop
        reduced_G_matrix = G_matrix

    return reduced_G_matrix