Synthetic tests

[1]:
import pandas as pd
import numpy as np
import string
import scipy
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from paleokalmag import CoreFieldModel

from pdrm.utils import lif
from pdrm.data_handling import d2t, t2d
from pdrm.constants import archkalmag_field_params

from gen_synth_data import gen_data, distort_data
/tmp/ipykernel_1581/1122084220.py:1: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466

  import pandas as pd

In a first step we need a reference process sampled from the prior distribution of the underlying Gaussian Process. From this reference process we can then sample archaeomagnetic data as well as reference sediment data. The reference sediment data need then to be distorted using customized lock-in functions.

[2]:
new_sample_from_prior = False    # set to True to generate a new reference model
# We use the temporal and spartial distribution of arch_data.csv and sed_data_KLK.csv
arch_real = pd.read_csv("../dat/arch_data.csv")
# We use only every third records to test the method on a smaller data set
sed_real = pd.read_csv("../dat/sed_data_KLK.csv")
sed_real = sed_real[sed_real.t >= -6000][::3].reset_index()

knots, dt = np.linspace(
    min(arch_real["t"]), max(arch_real["t"]), 1000, retstep=True
)
if new_sample_from_prior:
    model = CoreFieldModel(*archkalmag_field_params.values())
    prior_sample = model.sample_prior(ts=knots, n_samps=10)[0, :, :]
else:
    prior_sample = np.load("../dat/prior_sample.npy")
spline = scipy.interpolate.BSpline(
    np.insert(knots, [len(knots), 0], [max(knots) + dt, min(knots) - dt]),
    prior_sample.T,
    1,
)
[3]:
gen_new_arch_data = False       # set to True to generate new synthetic arch data
gen_new_sed_data = False        # set to True to generate new synthetic sed data

if gen_new_arch_data:
    arch, _ = gen_data(
        t=arch_real["t"],       # use temporal distribution of real arch data
        lat=arch_real["lat"],   # use spatial distribution of real arch data
        lon=arch_real["lon"],   # use spatial distribution of real arch data
        field_spline=spline,    # use previously generated prior sample
        dt=arch_real["dt"],     # use temporal errors of real arch data
        ddec=arch_real["dD"],   # use measurement errors of real arch data
        dinc=arch_real["dI"],   # use measurement errors of real arch data
        dint=arch_real["dF"],   # use measurement errors of real arch data
        depth=None,             # arch data does not has depth
        with_noise=True,        # add some noise to synthetic arch data
    )
    # To use your new data store it as a csv file using
    # arch.to_csv("path/to/your/synth_arch_data.csv", index=False)
    # Then run the following line to ass unique identifiers
    # add_identifier("path/to/your/synth_arch_data.csv")
else:
    arch = pd.read_csv("../dat/synth_arch_data.csv")

if gen_new_sed_data:
    sed_clean, NEZ_sed_clean = gen_data(
        t=sed_real["t"],        # use temporal distribution of real sed data
        lat=sed_real["lat"],    # use latitude of real sed data
        lon=sed_real["lon"],    # use longitude of real sed data
        field_spline=spline,    # use previously generated prior sample
        dt=sed_real["dt"],      # use temporal errors of real sed data
        depth=sed_real["depth"],# use depth of real sed data
        with_noise=False,       # no noise for reference sediment data
    )
    sed = distort_data(
        sed_clean,
        NEZ_sed_clean,
        ddec=sed_real["dD"],    # use measurement errors of real sed data
        dinc=sed_real["dI"],    # use measurement errors of real sed data
        bs=[10, 10, 30, 10],    # lock-in function parameters
    )
    # To use your new data store it as a csv file using
    # sed.to_csv("path/to/your/sed_data.csv", index=False)
    # Then run the following line to ass unique identifiers
    # add_identifier("path/to/your/sed_data.csv")
else:
    sed_clean = pd.read_csv("../dat/synth_sed_data_clean_sweden.csv")
    sed = pd.read_csv("../dat/a_sweden.csv")

sed = pd.read_csv("../dat/a_sweden.csv")
sed["type"] = "sediments"
fig, axs = plt.subplots(2, 1, figsize=(10, 4), sharex=True, dpi=300)
for ax, d_i in zip(axs, ["D", "I"]):
    ax.plot(sed_clean.depth, sed_clean[d_i], c="C2", label="reference process")
    ax.errorbar(sed.depth, sed[d_i], yerr=sed[f"d{d_i}"], alpha=0.7, c="C1", fmt=".", ls="", label="distorted sediment data")
axs[1].set_xlabel("Absolute depth [years]")
axD_xax2 = axs[0].secondary_xaxis("top", functions=(d2t(sed), t2d(sed)))
axD_xax2.set_xlabel("Absolute time [years]")
axs[1].set_xlabel("Absolute depth [cm]")
axs[1].legend(bbox_to_anchor=(0.5, -0.7), loc='center', ncol=2)
axs[0].set_ylabel("Declination")
axs[1].set_ylabel("Inclination")
fig.align_ylabels()
plt.show()
../../_images/1_Estimating_pDRM_effects_src_synthetic_tests_4_0.png

We used the functions from above to generate twelve synthetic sediment data sets. Six located on Rapa Iti and six located in Sweden. For both locations we used the same six lock-in functions that can be found in the next image.

[4]:
x, dx = np.linspace(0, 44, 1000, retstep=True)
all_bs = np.array(
    [
        [1, 0, 20, 0],      # a
        [1, 10, 0, 10],     # b
        [21, 0, 20, 0],     # c
        [21, 10, 0, 10],    # d
        [1, 0, 0, 40],      # e
        [1, 40, 0, 0],      # f
    ]
)
names = list(string.ascii_lowercase)[0: len(all_bs)]
for bs, name in zip(all_bs, names):
    plt.plot(x, lif(bs)(x), label=name)
plt.legend()
plt.title("Lock-in functions")
plt.show()
../../_images/1_Estimating_pDRM_effects_src_synthetic_tests_6_0.png

To estimate the lock-in function parameters run the following line in your terminal:

foo@bar:~$ python estimate_parameters.py {sed_data_name} {max_lid} {n_runs}

where max_lid is the upper bound for the parameter estimation and n_runs stands for the number of estimations you want to perform. For example to run 50 estimations with an upper bound of 100 cm for the synthetic data set a_sweden.csv execute

foo@bar:~$ python estimate_parameters.py a_sweden 100 50

We recommend using a GPU for faster computation. To adjust the optimizer parameters or the utilized synthetic arch data change these values directly in estimate_parameters.py

The results of the parameter estimation can be found in the results folder and are visualized in the next cell.

[5]:
def numerate_plots(ax, text):
    ax.text(
        0,
        1,
        text,
        color="black",
        fontsize=9,
        fontweight="bold",
        va="bottom",
        ha="center",
        transform=ax.transAxes,
        bbox=dict(
            facecolor="#DCE6F2",
            edgecolor="#004378",
            linewidth=1.3,
            pad=0.35,
            boxstyle="round,rounding_size=0.25",
        ),
    )


def compute_hlid(bs):
    lif_ant = lif(bs).antiderivative()
    bs_cs = np.cumsum(bs)
    return scipy.optimize.root_scalar(
        lambda x: lif_ant(x) - 0.5,
        x0=bs_cs[2] - bs_cs[1],
        bracket=[bs_cs[0], bs_cs[3]],
    ).root


d, dt = np.linspace(-3, 100, 5000, retstep=True)
names = list(string.ascii_lowercase)[0:6]
n = len(names)
titles = [
    "Lock-in functions",
    "Half lock-in depth",
    "Lock-in function height",
    "Lock-in function width",
    "Log-marginal-likelihood values",
]
xlabels = [
    "Relative depth [cm]",
    r"$\rho_{0.5}$",
    r"$\rho_h$",
    r"$\rho_w$",
    "log-ml values",
]
cmap = LinearSegmentedColormap.from_list("rb", ["C3", "purple", "C0"], N=256)

for loc in ["sweden", "rapa"]:
    fig, axs = plt.subplots(n, 5, figsize=(15, 1.7 * n), sharex="col", dpi=500)
    fig.tight_layout(h_pad=0, w_pad=0.2)
    mins, maxs = [], []
    min_hlid, max_hlid = 0, 0
    placeholder = (np.nan, np.nan)
    limits = {
        "hlid": placeholder,
        "height": placeholder,
        "lif_width": placeholder,
    }
    for i, name in enumerate([f"{n}_{loc}" for n in names]):
        df = pd.read_csv(f"../dat/{name}.csv")
        real_bs = np.load(f"../dat/{name}_bs.npz")["res"]
        real_params = {
            "hlid": compute_hlid(real_bs),
            "height": np.max(lif(real_bs)(d)),
            "lif_width": np.sum(real_bs) - real_bs[0],
        }
        res = pd.read_csv(f"../results/{name}_params.csv")
        if loc == "sweden":
            res = res[res.polish != True]
        res["log_ml"] = -res["log_ml"]
        norm = plt.Normalize(vmin=min(res["log_ml"]), vmax=max(res["log_ml"]))
        res["hlid"] = res.apply(
            lambda row: compute_hlid(list(row[0:4])), axis=1
        )
        res["height"] = res.apply(
            lambda row: np.max(lif(list(row[0:4]))(d)), axis=1
        )
        res["lif_width"] = res.apply(
            lambda row: np.sum(row[0:4]) - row[0], axis=1
        )
        for idx, row in res.iterrows():
            axs[i, 0].plot(
                d,
                lif(list(row[0:4]))(d),
                c=cmap(norm(row["log_ml"])),
                alpha=0.8,
                label="Estimated\nlock-in function" if idx == 0 else None,
            )
        axs[i, 0].plot(
            d,
            lif(list(real_bs))(d),
            c="C1",
            lw=3,
            label="Real\nlock-in function",
        )
        numerate_plots(axs[i, 0], list(string.ascii_uppercase)[i])
        for ax, param_name in zip(
            axs[i, 1:], list(res.columns[-3:]) + ["log_ml"]
        ):
            param = res[param_name]
            density = scipy.stats.gaussian_kde(param)
            if param_name != "log_ml":
                ax.axvline(real_params[param_name], c="C1", zorder=2)
                limits[param_name] = (
                    min(
                        d[np.nonzero(np.round(density(d), 15))[0][0]],
                        limits[param_name][0],
                    ),
                    max(
                        d[np.argmin(1 - np.cumsum(density(d)) * dt)],
                        limits[param_name][1],
                    ),
                )
                density_args = np.linspace(
                    limits[param_name][0], limits[param_name][1], 5000
                )
                ax.plot(
                    density_args, density(density_args), c="black", ls="--"
                )
                ax.hist(
                    param,
                    bins=10,
                    density=True,
                    color="C0",
                    alpha=0.8,
                    weights=norm(res["log_ml"]),
                )
            if param_name == "log_ml":
                x = (
                    np.linspace(-159420, -159300, 5000)
                    if loc == "sweden"
                    else np.linspace(-158930, -158860, 5000)
                )
                ax.plot(x, density(x), c="black", ls="--")
                _, bins, patches = ax.hist(
                    param, bins=10, density=True, color="C0", alpha=0.8
                )
                log_ml_centers = [
                    np.mean(
                        res[
                            (res[param_name] >= bins[b])
                            & (res[param_name] < bins[b + 1])
                        ]["log_ml"]
                    )
                    for b in range(len(bins) - 1)
                ]
                for c, p in zip(log_ml_centers, patches):
                    plt.setp(p, "facecolor", cmap(norm(c)))
        for i in range(n):
            for j, vals in enumerate(limits.values()):
                axs[i, j + 1].set_xlim(vals)
    [axs[n - 1, i].set_xlabel(xlabels[i], fontsize=14) for i in range(5)]
    [axs[0, i].set_title(titles[i], fontsize=14) for i in range(5)]
    axs[0, 0].legend()
    fig.align_xlabels()
    Loc = "Sweden" if loc == "sweden" else "Rapa Iti"
    fig.suptitle(
        f"Results for synthetic test located in {Loc}", fontsize=18, y=1.045
    )
    plt.show()
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
../../_images/1_Estimating_pDRM_effects_src_synthetic_tests_8_6.png
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
/tmp/ipykernel_1581/2248532584.py:82: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
../../_images/1_Estimating_pDRM_effects_src_synthetic_tests_8_13.png