Preprocessing

[1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from cartopy import crs as ccrs, feature as cfeature
from tqdm import tqdm
import requests
import io
import tempfile

import arviz as az
import importlib

from paleokalmag import PaleoKalmag
from paleokalmag.data_handling import read_data
# load ArchKalmag14k.r model
r = requests.get(
    "https://nextcloud.gfz-potsdam.de/s/BqN2XD7c9jK3AsF/download", stream=True
)
archkalmag = PaleoKalmag(io.BytesIO(r.raw.read()), name="ArchKalmag14k.r", legacy=True)
# Alternatively, you can download the model and save it locally
# archkalmag = PaleoKalmag(
#     "path/to/ArchKalmag14k.r.npz", name="ArchKalmag14k.r"
# )

from age_depth_modeling.evaluate_adm import evaluate_adm
from constants import mad_to_alpha_factors, default_alpha95
from calibrate_dec import calibrate
from plot_functions import plot_adm, plot_D_I

pre = "https://nextcloud.gfz-potsdam.de/s/"
post = "/download"
adm_links = {
    "BA3": f"{pre}WQaSYe9eNffzyZJ{post}",
    "BYE": f"{pre}qH5Zc685D8g9BYb{post}",
    "ByaP2": f"{pre}LAgSbXkkDKjcgrC{post}",
    "ByaP3": f"{pre}tEipBed36aYASP3{post}",
    "EIL": f"{pre}JitiXQXPrjbjRRe{post}",
    "FUR": f"{pre}mSMrTBnmWZ8wCsz{post}",
    "P2": f"{pre}jYd7ka4ncWeiF6e{post}",
    "P3": f"{pre}p8yWE8SZYgWQg7i{post}",
    "GYL": f"{pre}5pD7kZE5gGK68kF{post}",
    "GP1": f"{pre}JEdgZXQXS3dMdcT{post}",
    "GP2": f"{pre}8XHGZFCpL3xRW2g{post}",
    "GP4": f"{pre}DSJXNmH9xqPY6di{post}",
    "MOR": f"{pre}oKSjPZN7BgxdZF6{post}",
    "U1305": f"{pre}7Seyd4w44sfkZcC{post}",
    "BIR_unflat": f"{pre}sAnaqtoJrSd9S2X{post}",
}
/tmp/ipykernel_1762/3593069225.py:1: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466

  import pandas as pd
cpu
[2]:
core_ids = [
    "BA3",
    "BYE",
    # "ByaP2",
    # "ByaP3",
    # "EIL",
    # "FUR",
    # "P2",
    # "P3",
    # "GYL",
    # "GP1",
    # "GP2",
    # "GP4",
    # "MOR",
    # "U1305",
    # "BIR_unflat",
]
cores = {}
for core_id in tqdm(core_ids):
    print(core_id)
    lib = importlib.import_module(core_id + "." + core_id)
    df = lib.paleomag_data.copy()
    demag_steps = lib.demag_steps
    try:
        df.loc[lib.remove_D, "D"] = np.nan
        df.loc[lib.remove_I, "I"] = np.nan
    except AttributeError:
        pass
    df.dropna(subset=["D", "I"], how="all", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df["lon"].where(df["lon"] > 0, other=df["lon"] + 360, inplace=True)
    if "MAD" in df.columns:
        df["alpha_95"] = mad_to_alpha_factors[str(demag_steps)] * df["MAD"]
    else:
        df["MAD"] = np.nan
    if "alpha_95" not in df.columns:
        df["alpha_95"] = np.nan
    df["alpha_95"].where(
        df["alpha_95"].notna(), other=default_alpha95, inplace=True
    )
    df["dI"] = (57.3 / 140) * df["alpha_95"]
    df["dD"] = np.where(
        df["I"].notna(),
        df["dI"] / np.cos(np.deg2rad(df["I"])),
        df["dI"] / np.cos(np.arctan(2 * np.tan(np.deg2rad(df["lat"])))),
    )
    df["type"] = "sediment"
    try:
        # idata = az.InferenceData.from_netcdf(
        #     f"../../dat/age_depth_models/{core_id}_adm.nc"
        # )
        r = requests.get(adm_links[core_id])
        with tempfile.NamedTemporaryFile(mode='w+b', delete=False) as temp_file:
            temp_file.write(r.content)
        idata = az.from_netcdf(temp_file.name)
        t, dt = evaluate_adm(idata, df["depth"].values)
        df["t"] = t
        df["dt"] = dt
    except FileNotFoundError:
        df["t"] = np.nan
        df["dt"] = np.nan
    df["D_rel"] = df["D"]
    for key, group in df.groupby("core"):
        try:
            cal_mean, cal_std = calibrate(
                group[group["t"] >= -12000], archkalmag
            )
            print(f"{core_id} - {key}: {cal_mean:.2f} +/- {cal_std:.2f}")
            df.loc[df["core"] == key, "D"] = (
                df.loc[df["core"] == key, "D"] - cal_mean
            )
        except:
            cal_mean, cal_std = calibrate(
                group[group["t_old"] >= -12000], archkalmag
            )
            print(f"{core_id} - {key}: {cal_mean:.2f} +/- {cal_std:.2f}")
            df.loc[df["core"] == key, "D"] = (
                df.loc[df["core"] == key, "D"] - cal_mean
            )
    df["D"].where(df["D"] <= 180, df["D"] - 360, inplace=True)
    df = df[
        [
            "depth",
            "t",
            "dt",
            "t_old",
            "lat",
            "lon",
            "D",
            "D_rel",
            "I",
            "dD",
            "dI",
            "MAD",
            "core",
            "type",
            "UID",
            "FID",
            "alpha_95",
        ]
    ]
    df.sort_values("t", inplace=True)
    df.reset_index(drop=True, inplace=True)
    cores[core_id] = df
more-to-come:

class:

stderr

0%| | 0/2 [00:00&lt;?, ?it/s]

</pre>

0%| | 0/2 [00:00<?, ?it/s]

end{sphinxVerbatim}

0%| | 0/2 [00:00<?, ?it/s]

BA3
/tmp/ipykernel_1762/11818589.py:31: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/tmp/ipykernel_1762/11818589.py:38: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/arviz/data/inference_data.py:153: UserWarning: adm_pars group is not defined in the InferenceData scheme
BA3 - GEO17603-3-1: -2.40 +/- 0.39
more-to-come:

class:

stderr

/tmp/ipykernel_1762/11818589.py:80: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
50%|█████ | 1/2 [00:04&lt;00:04, 4.68s/it]

</pre>

50%|█████ | 1/2 [00:04<00:04, 4.68s/it]

end{sphinxVerbatim}

50%|█████ | 1/2 [00:04<00:04, 4.68s/it]

BA3 - GEO17603-3-2: -25.95 +/- 0.57
BYE
/tmp/ipykernel_1762/11818589.py:31: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/tmp/ipykernel_1762/11818589.py:38: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/arviz/data/inference_data.py:153: UserWarning: adm_pars group is not defined in the InferenceData scheme
BYE - ByaP2: -1.05 +/- 0.90
/tmp/ipykernel_1762/11818589.py:80: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
100%|██████████| 2/2 [00:07&lt;00:00, 3.60s/it]

</pre>

100%|██████████| 2/2 [00:07<00:00, 3.60s/it]

end{sphinxVerbatim}

100%|██████████| 2/2 [00:07<00:00, 3.60s/it]

100%|██████████| 2/2 [00:07&lt;00:00, 3.76s/it]

</pre>

100%|██████████| 2/2 [00:07<00:00, 3.76s/it]

end{sphinxVerbatim}

100%|██████████| 2/2 [00:07<00:00, 3.76s/it]

BYE - ByaP3: 0.05 +/- 1.02

[3]:
for i, (id, core) in enumerate(cores.items()):
    fig, axs = plt.subplot_mosaic(
        [
            4 * ["D"],
            4 * ["I"],
        ],
        figsize=(10, 3),
        gridspec_kw=dict(hspace=0, wspace=0.8),
        dpi=300,
    )
    plot_D_I(
        axs["D"],
        axs["I"],
        core,
        time_or_depth="depth",
        archkalmag14k=archkalmag,
        limit_y_to_data=True,
    )
    fig.align_ylabels()
    fig.align_xlabels()
    plt.show()
../../../_images/2_data_based_investigation_src_preprocessing_preprocessing_3_0.png
../../../_images/2_data_based_investigation_src_preprocessing_preprocessing_3_1.png
[4]:
proj = ccrs.Mollweide()
fig = plt.figure(figsize=(10, 4), dpi=400)
sph_ax = plt.subplot(projection=proj)
sph_ax.set_global()
sph_ax.add_feature(cfeature.LAND, zorder=0, color="lightgray")

with np.load("../../dat/rejection_list.npz", allow_pickle=True) as fh:
    to_reject = fh["to_reject"]
arch_data = read_data("../../dat/archkalmag_data.csv", rejection_lists=to_reject)

sph_ax.scatter(
    arch_data.lon,
    arch_data.lat,
    color="gray",
    s=10,
    alpha=0.8,
    lw=0,
    rasterized=True,
    transform=ccrs.PlateCarree(),
    label="Archaoemagnetic data"
)
for i, (core_id, core) in enumerate(cores.items()):
    if core_id in ["BIR_unflat", "ByaP2", "ByaP3", "GP1", "GP2", "GP4", "P2", "P3"]:
        continue
    label = f"{core_id}"
    for j, (key, group) in enumerate(core.groupby("core")):
        sph_ax.scatter(
            group.lon,
            group.lat,
            color=plt.get_cmap("turbo", len(cores.values()))(i),
            label=label if j == len(core["core"].unique()) - 1 else None,
            s=70,
            marker="*",
            rasterized=True,
            transform=ccrs.PlateCarree(),
        )
sph_ax.legend(
    loc="center", prop={"size": 6}, bbox_to_anchor=(0.5, -0.05), ncol=11
)
fig.autofmt_xdate()
fig.tight_layout()
Rejected 1150 outliers.
/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:160: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:166: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:180: UserWarning: Records with indices [   72    73    74    75    76    77    78    79    80    81   108   109
   356   357  1064  1065  1152  1613  2399  2572  2668  2701  2750  2753
  2754  3032  3211  3213  3214  3215  3216  3217  3218  3219  3220  3221
  3499  3768  3984  4043  4231  4411  4500  4526  4527  4651  5168  5322
  5430  5547  5703  5759  5798  5801  5955  6049  6101  6563  6643  6702
  6777  6805  7145  7576  7581  7589  7688  7698  7731  7772  7802  7822
  7877  7975  8063  8292  8323  8423  8506  8666  8704  8748  9220 10128
 10327 10505 10598 10600 10643 11523 11713 11754] contain declination, but not inclination! The errors need special treatment!
To be able to use the provided data, these records have been dropped from the output.
/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:189: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:197: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/paleokalmag/data_handling.py:198: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


/opt/conda/envs/virtualenv/lib/python3.10/site-packages/cartopy/io/__init__.py:241: DownloadWarning: Downloading: https://naturalearth.s3.amazonaws.com/110m_physical/ne_110m_land.zip
../../../_images/2_data_based_investigation_src_preprocessing_preprocessing_4_2.png
[5]:
for core_id, core in cores.items():
    core[
        [
            "depth",
            "t",
            "dt",
            "lat",
            "lon",
            "D",
            "I",
            "dD",
            "dI",
            "type",
            "core",
            "UID",
            "FID",
        ]
    ]#.to_csv(f"../../dat/preprocessed_data/{core_id}.csv", index=False)