# Source code for mlx_graphs.datasets.elliptic

from typing import Callable, Optional

import mlx.core as mx
import numpy as np

from mlx_graphs.data import GraphData
from mlx_graphs.datasets.dataset import Dataset
from mlx_graphs.datasets.utils import download, extract_archive



# [docs]
class EllipticBitcoinDataset(Dataset):
    """The Elliptic Bitcoin dataset of Bitcoin transactions from the
    `"Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional
    Networks for Financial Forensics" <https://arxiv.org/abs/1908.02591>`_
    paper.

    :class:`EllipticBitcoinDataset` maps Bitcoin transactions to real entities
    belonging to licit categories (exchanges, wallet providers, miners,
    licit services, etc.) versus illicit ones (scams, malware, terrorist
    organizations, ransomware, Ponzi schemes, etc.).

    The graph contains 203,769 transaction nodes and 234,355 directed
    payment-flow edges, with two percent of nodes (4,545) labelled as
    illicit and twenty-one percent of nodes (42,019) labelled as licit.
    The remaining transactions are unknown.

    Node labels are encoded as integers: raw class ``"2"`` becomes ``0``,
    raw class ``"1"`` becomes ``1`` and ``"unknown"`` becomes ``2``.
    (Presumably ``"1"`` is the illicit class and ``"2"`` the licit one, as in
    the paper -- TODO confirm against the dataset description.)

    Args:
        base_dir: Directory where to store dataset files. Default is
            in the local directory ``.mlx_graphs_data/``.
        pre_transform: A function/transform which
            takes in a GraphData object and returns a transformed
            version. The data will be transformed before saving to
            the disk.
        transform: A function/transform that
            takes in a GraphData object and returns a transformed version.
            The data object will be transformed before every access.
    """

    # Root of the remote directory; each raw CSV is published there as
    # ``<file name>.zip``.
    _BASE_URL = "https://data.pyg.org/datasets/elliptic/"

    # Raw class string (second column of the classes CSV) -> integer label.
    _LABEL_ENCODING = {"unknown": 2, "1": 1, "2": 0}

    # Labelled nodes with a time step below this value go to the train mask,
    # the remaining labelled nodes go to the test mask.
    _FIRST_TEST_TIME_STEP = 35

    def __init__(
        self,
        base_dir: Optional[str] = None,
        pre_transform: Optional[Callable] = None,
        transform: Optional[Callable] = None,
    ):
        super().__init__(
            name="ellipticBitcoin",
            base_dir=base_dir,
            pre_transform=pre_transform,
            transform=transform,
        )

    @property
    def raw_file_names(self):
        """Names of the raw CSV files, in the order features, edges, classes.

        ``process`` indexes this list positionally, so the order matters.
        """
        return [
            "elliptic_txs_features.csv",
            "elliptic_txs_edgelist.csv",
            "elliptic_txs_classes.csv",
        ]

    def download(self):
        """Download and extract one zip archive per raw CSV into ``raw_path``."""
        for file_name in self.raw_file_names:
            download(f"{self._BASE_URL}{file_name}.zip", self.raw_path)
            extract_archive(
                f"{self.raw_path}/{file_name}.zip", f"{self.raw_path}"
            )

    @classmethod
    def _encode_labels(cls, raw_classes):
        """Translate raw class strings into the integer codes of
        ``_LABEL_ENCODING``.

        Args:
            raw_classes: 1-D NumPy array of class strings.

        Returns:
            A NumPy integer array with the same shape as ``raw_classes``.

        Raises:
            ValueError: If a class string is not a key of ``_LABEL_ENCODING``.
        """
        # -1 marks "not matched yet"; every valid code is non-negative.
        encoded = np.full(raw_classes.shape, -1, dtype=int)
        for raw_value, class_id in cls._LABEL_ENCODING.items():
            encoded[raw_classes == raw_value] = class_id

        unmatched = encoded < 0
        if unmatched.any():
            unexpected = sorted(set(raw_classes[unmatched].tolist()))
            raise ValueError(
                f"Unexpected class labels in the classes file: {unexpected}"
            )
        return encoded

    @staticmethod
    def _build_edge_index(tx_ids, edge_table):
        """Replace transaction ids in the edge list by node row indices.

        Args:
            tx_ids: 1-D array of transaction ids; the position of an id is
                the index of its node.
            edge_table: 2-D array with one edge per row, source id in
                column 0 and target id in column 1.

        Returns:
            A NumPy integer array of shape ``(2, num_edges)``.

        Raises:
            ValueError: If an edge references an id absent from ``tx_ids``.
        """
        index_of = {tx_id: row for row, tx_id in enumerate(tx_ids)}
        try:
            endpoints = [
                [index_of[tx_id] for tx_id in edge_table[:, column]]
                for column in (0, 1)
            ]
        except KeyError as err:
            raise ValueError(
                f"Edge list references transaction id {err.args[0]!r}, "
                "which is missing from the feature file"
            ) from err
        return np.array(endpoints, dtype=int)

    def process(self, train=True):
        """Parse the raw CSV files and store the graph in ``self.graphs``.

        The resulting :class:`GraphData` carries node features, integer node
        labels, the edge index, and two boolean node masks, ``train_mask`` and
        ``test_mask``. Both masks exclude nodes labelled as unknown; the
        split between them is made on the node time step.

        Labels are matched to nodes by row position, i.e. the classes file is
        expected to list transactions in the same order as the feature file.

        Args:
            train: Currently unused; kept so existing callers keep working.

        Raises:
            ValueError: If the classes file contains an unknown class string
                or the edge list references an unknown transaction id.
        """
        feature_file = f"{self.raw_path}/{self.raw_file_names[0]}"
        edge_file = f"{self.raw_path}/{self.raw_file_names[1]}"
        label_file = f"{self.raw_path}/{self.raw_file_names[2]}"

        # The feature file is read from its first line (no header skipped):
        # column 0 is the transaction id, column 1 the time step, and
        # columns 2..166 hold the numeric features.
        features_np = np.loadtxt(
            feature_file,
            dtype=float,
            delimiter=",",
            usecols=np.arange(2, 167),
        )
        node_ids = np.loadtxt(
            feature_file,
            dtype=str,
            delimiter=",",
            usecols=np.arange(0, 2),
        )
        # The edge and class files start with a header line, hence skiprows=1.
        edge_table = np.loadtxt(edge_file, dtype=str, delimiter=",", skiprows=1)
        label_table = np.loadtxt(label_file, dtype=str, delimiter=",", skiprows=1)

        node_features = mx.array(features_np)
        node_labels = mx.array(self._encode_labels(label_table[:, 1]))
        edge_index = mx.array(self._build_edge_index(node_ids[:, 0], edge_table))
        time_step = mx.array(node_ids[:, 1].astype(int))

        # Only labelled nodes take part in either split.
        is_labelled = node_labels != self._LABEL_ENCODING["unknown"]
        train_mask = (time_step < self._FIRST_TEST_TIME_STEP) & is_labelled
        test_mask = (time_step >= self._FIRST_TEST_TIME_STEP) & is_labelled

        graph = GraphData(
            edge_index=edge_index,
            node_features=node_features,
            node_labels=node_labels,
        )
        graph.train_mask = train_mask
        graph.test_mask = test_mask

        self.graphs = [graph]