first commit
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
# Public names re-exported by this package. Every entry below is bound by
# one of the two import statements that follow the list.
__all__ = [
    "NaT",
    "NaTType",
    "OutOfBoundsDatetime",
    "Period",
    "Timedelta",
    "Timestamp",
    "iNaT",
    "Interval",
]


from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
    NaT,
    NaTType,
    OutOfBoundsDatetime,
    Period,
    Timedelta,
    Timestamp,
    iNaT,
)
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,12 @@
|
||||
from pandas._libs.dtypes cimport numeric_t
|
||||
|
||||
|
||||
# Declaration only (no body here): a C-level function over the fused
# `numeric_t` type taking a raw pointer `arr` plus the sizes `k` and `n`.
# Marked `nogil`, so it may be called without holding the GIL.
cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil
|
||||
|
||||
# Enumeration of tie-breaking modes. No explicit values are given, so the
# members take consecutive integer values in declaration order, starting at 0.
# NOTE(review): presumably consumed by the rank routines - confirm in algos.pyx.
cdef enum TiebreakEnumType:
    TIEBREAK_AVERAGE
    TIEBREAK_MIN
    TIEBREAK_MAX
    TIEBREAK_FIRST
    TIEBREAK_FIRST_DESCENDING
    TIEBREAK_DENSE
|
||||
@@ -0,0 +1,446 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
class Infinity:
    """
    Provide a positive Infinity comparison method for ranking.
    """

    # Stub declarations only: all six rich comparisons are declared and
    # each is typed as returning ``bool``.
    def __eq__(self, other: object) -> bool: ...
    def __ne__(self, other: object) -> bool: ...
    def __lt__(self, other: object) -> bool: ...
    def __le__(self, other: object) -> bool: ...
    def __gt__(self, other: object) -> bool: ...
    def __ge__(self, other: object) -> bool: ...
|
||||
|
||||
class NegInfinity:
    """
    Provide a negative Infinity comparison method for ranking.
    """

    # Stub declarations only: all six rich comparisons are declared and
    # each is typed as returning ``bool``.
    def __eq__(self, other: object) -> bool: ...
    def __ne__(self, other: object) -> bool: ...
    def __lt__(self, other: object) -> bool: ...
    def __le__(self, other: object) -> bool: ...
    def __gt__(self, other: object) -> bool: ...
    def __ge__(self, other: object) -> bool: ...
|
||||
|
||||
# Type stub. The trailing comments record the C-level signature: an int64
# memoryview in, a 1-D int64 ndarray out.
def unique_deltas(
    arr: np.ndarray,  # const int64_t[:]
) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=1]
|
||||
# Type stub: accepts a list of int64 ndarrays and is typed as returning bool.
def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ...
|
||||
# Type stub. Takes an int64 `index` memoryview and a group count, and is
# typed as returning a pair of 1-D int64 ndarrays (per the inline comments).
def groupsort_indexer(
    index: np.ndarray,  # const int64_t[:]
    ngroups: int,
) -> tuple[
    np.ndarray,  # ndarray[int64_t, ndim=1]
    np.ndarray,  # ndarray[int64_t, ndim=1]
]: ...
|
||||
# Type stub. `a` is a numeric memoryview at the C level; the result is a
# scalar of that numeric type, hence the loose `Any` return annotation.
def kth_smallest(
    a: np.ndarray,  # numeric[:]
    k: int,
) -> Any: ...  # numeric
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Pairwise correlation/covariance
|
||||
|
||||
# Type stub. Takes a 2-D float64 matrix and returns a 2-D float64 ndarray;
# `cov` and `minp` have defaults that are not visible in the stub.
def nancorr(
    mat: npt.NDArray[np.float64],  # const float64_t[:, :]
    cov: bool = ...,
    minp: int | None = ...,
) -> npt.NDArray[np.float64]: ...  # ndarray[float64_t, ndim=2]
|
||||
# Type stub. Takes a 2-D float64 ndarray and returns a 2-D float64 ndarray;
# unlike `nancorr`, `minp` is typed as a plain int here.
def nancorr_spearman(
    mat: npt.NDArray[np.float64],  # ndarray[float64_t, ndim=2]
    minp: int = ...,
) -> npt.NDArray[np.float64]: ...  # ndarray[float64_t, ndim=2]
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
# ctypedef fused algos_t:
|
||||
# float64_t
|
||||
# float32_t
|
||||
# object
|
||||
# int64_t
|
||||
# int32_t
|
||||
# int16_t
|
||||
# int8_t
|
||||
# uint64_t
|
||||
# uint32_t
|
||||
# uint16_t
|
||||
# uint8_t
|
||||
|
||||
# Type stub: `nobs` may be None, `limit` is untyped with a default that is
# not visible in the stub; typed as returning int.
def validate_limit(nobs: int | None, limit=...) -> int: ...
|
||||
# Type stub. `old` and `new` share the fused `algos_t` element type at the
# C level; the result is a 1-D intp ndarray.
def pad(
    old: np.ndarray,  # ndarray[algos_t]
    new: np.ndarray,  # ndarray[algos_t]
    limit=...,
) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
|
||||
# Type stub. Typed as returning None; takes 1-D `values` and a uint8 `mask`
# (C-level types in the inline comments).
def pad_inplace(
    values: np.ndarray,  # algos_t[:]
    mask: np.ndarray,  # uint8_t[:]
    limit=...,
) -> None: ...
|
||||
# Type stub. 2-D counterpart of the signature above it in the stub file:
# 2-D `values`, 2-D const uint8 `mask`, typed as returning None.
def pad_2d_inplace(
    values: np.ndarray,  # algos_t[:, :]
    mask: np.ndarray,  # const uint8_t[:, :]
    limit=...,
) -> None: ...
|
||||
# Type stub. `old` and `new` share the fused `algos_t` element type at the
# C level; the result is a 1-D intp ndarray.
def backfill(
    old: np.ndarray,  # ndarray[algos_t]
    new: np.ndarray,  # ndarray[algos_t]
    limit=...,
) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
|
||||
# Type stub. Typed as returning None; takes 1-D `values` and a uint8 `mask`
# (C-level types in the inline comments).
def backfill_inplace(
    values: np.ndarray,  # algos_t[:]
    mask: np.ndarray,  # uint8_t[:]
    limit=...,
) -> None: ...
|
||||
# Type stub. 2-D `values`, 2-D const uint8 `mask`, typed as returning None.
def backfill_2d_inplace(
    values: np.ndarray,  # algos_t[:, :]
    mask: np.ndarray,  # const uint8_t[:, :]
    limit=...,
) -> None: ...
|
||||
# Type stub. Takes a 1-D array and a required `timelike` flag; typed as
# returning a 3-tuple of bools.
# NOTE(review): the meaning/order of the three flags is not visible here.
def is_monotonic(
    arr: np.ndarray,  # ndarray[algos_t, ndim=1]
    timelike: bool,
) -> tuple[bool, bool, bool]: ...
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# rank_1d, rank_2d
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
# ctypedef fused rank_t:
|
||||
# object
|
||||
# float64_t
|
||||
# uint64_t
|
||||
# int64_t
|
||||
|
||||
# Type stub. Only `values` is required; every other parameter has a default
# that is not visible in the stub. Result is a 1-D float64 ndarray.
def rank_1d(
    values: np.ndarray,  # ndarray[rank_t, ndim=1]
    labels: np.ndarray | None = ...,  # const int64_t[:]=None
    is_datetimelike: bool = ...,
    ties_method=...,
    ascending: bool = ...,
    pct: bool = ...,
    na_option=...,
) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
|
||||
# Type stub. Only `in_arr` is required; every other parameter has a default
# that is not visible in the stub. Note the keyword order differs from
# rank_1d: here `na_option` precedes `pct`.
# NOTE(review): the return comment below says ndim=1 although the input is
# 2-D; it probably should read ndim=2 - confirm against the implementation.
def rank_2d(
    in_arr: np.ndarray,  # ndarray[rank_t, ndim=2]
    axis: int = ...,
    is_datetimelike: bool = ...,
    ties_method=...,
    ascending: bool = ...,
    na_option=...,
    pct: bool = ...,
) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
|
||||
# Type stub. Typed as returning None and takes a caller-supplied `out`
# array; `arr` and `out` are both 2-D (C-level types in the inline comments).
def diff_2d(
    arr: np.ndarray,  # ndarray[diff_t, ndim=2]
    out: np.ndarray,  # ndarray[out_t, ndim=2]
    periods: int,
    axis: int,
    datetimelike: bool = ...,
) -> None: ...
|
||||
# Type stubs for the ensure_<dtype> family. Each accepts any object and is
# typed as returning an ndarray of the named dtype. All except
# ensure_platform_int and ensure_object also take an untyped `copy` argument.
# NOTE(review): the generator template visible alongside this stub only
# enables float64 and int8/16/32/64; the float32, uint* and complex* stubs
# may describe functions that are not generated - confirm before relying on them.
def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ...
def ensure_object(arr: object) -> npt.NDArray[np.object_]: ...
def ensure_complex64(arr: object, copy=...) -> npt.NDArray[np.complex64]: ...
def ensure_complex128(arr: object, copy=...) -> npt.NDArray[np.complex128]: ...
def ensure_float64(arr: object, copy=...) -> npt.NDArray[np.float64]: ...
def ensure_float32(arr: object, copy=...) -> npt.NDArray[np.float32]: ...
def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ...
def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ...
def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ...
def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ...
def ensure_uint8(arr: object, copy=...) -> npt.NDArray[np.uint8]: ...
def ensure_uint16(arr: object, copy=...) -> npt.NDArray[np.uint16]: ...
def ensure_uint32(arr: object, copy=...) -> npt.NDArray[np.uint32]: ...
def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ...
|
||||
# Type stubs for the take_1d_<in>_<out> family (one per input/output dtype
# pair). All share one signature: source `values`, an intp `indexer`, a
# caller-supplied `out` array, and an untyped `fill_value`; typed as
# returning None.
def take_1d_int8_int8(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int8_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int16(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int16_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int64_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_int64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float32_float32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_float64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_object_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_bool_bool(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_1d_bool_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
|
||||
# Type stubs for the take_2d_axis0_<in>_<out> family (one per input/output
# dtype pair). All share one signature: source `values`, an intp `indexer`,
# a caller-supplied `out` array, and an untyped `fill_value`; typed as
# returning None.
def take_2d_axis0_int8_int8(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int8_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int16(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int16_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int64_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_int64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float32_float32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_float64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_object_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_bool_bool(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis0_bool_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
|
||||
# Type stubs for the take_2d_axis1_<in>_<out> family (one per input/output
# dtype pair). All share one signature: source `values`, an intp `indexer`,
# a caller-supplied `out` array, and an untyped `fill_value`; typed as
# returning None.
def take_2d_axis1_int8_int8(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int8_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int16(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int16_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_int32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int64_int64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_int64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float32_float32(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float32_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_float64_float64(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_object_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_bool_bool(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
def take_2d_axis1_bool_object(
    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
) -> None: ...
|
||||
# Type stubs for the take_2d_multi_<in>_<out> family (one per input/output
# dtype pair). Unlike the single-axis families, `indexer` is a pair of intp
# arrays. `out` is caller-supplied and every stub is typed as returning None.
def take_2d_multi_int8_int8(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int8_int32(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int8_int64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int8_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int16_int16(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int16_int32(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int16_int64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int16_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int32_int32(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int32_int64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int32_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int64_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_float32_float32(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_float32_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_float64_float64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_object_object(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_bool_bool(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_bool_object(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
def take_2d_multi_int64_int64(
    values: np.ndarray,
    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
    out: np.ndarray,
    fill_value=...,
) -> None: ...
|
||||
1478
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/algos.pyx
Normal file
1478
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/algos.pyx
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Template for each `dtype` helper function using 1-d template
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# ensure_dtype
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def ensure_platform_int(object arr):
    # Return `arr` as an intp-typed ndarray.
    #   - ndarray whose type number is already NPY_INTP: returned as-is
    #     (same object, no copy)
    #   - any other ndarray: cast via PyArray_Cast
    #   - non-ndarray input: built with np.array(..., dtype=np.intp)
    # GH3033, GH1392
    # platform int is the size of the int pointer, e.g. np.intp
    if util.is_array(arr):
        if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
            return arr
        else:
            # equiv: arr.astype(np.intp)
            return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
    else:
        return np.array(arr, dtype=np.intp)
|
||||
|
||||
|
||||
def ensure_object(object arr):
    # Return `arr` as an object-dtype ndarray.
    #   - ndarray whose type number is already NPY_OBJECT: returned as-is
    #     (same object, no copy)
    #   - any other ndarray: cast via PyArray_Cast
    #   - non-ndarray input: built with np.array(..., dtype=np.object_)
    if util.is_array(arr):
        if (<ndarray>arr).descr.type_num == NPY_OBJECT:
            return arr
        else:
            # equiv: arr.astype(object)
            return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
    else:
        return np.array(arr, dtype=np.object_)
|
||||
|
||||
{{py:

# name, c_type, dtype
# Each enabled row produces one ensure_<name> function in the loop that
# follows this block; commented-out rows are intentionally not generated.
dtypes = [('float64', 'FLOAT64', 'float64'),
          # ('float32', 'FLOAT32', 'float32'),  # disabling bc unused
          ('int8', 'INT8', 'int8'),
          ('int16', 'INT16', 'int16'),
          ('int32', 'INT32', 'int32'),
          ('int64', 'INT64', 'int64'),
          # Disabling uint and complex dtypes because we do not use them
          # (and compiling them increases wheel size)
          # ('uint8', 'UINT8', 'uint8'),
          # ('uint16', 'UINT16', 'uint16'),
          # ('uint32', 'UINT32', 'uint32'),
          # ('uint64', 'UINT64', 'uint64'),
          # ('complex64', 'COMPLEX64', 'complex64'),
          # ('complex128', 'COMPLEX128', 'complex128')
]

def get_dispatch(dtypes):
    # Yield each (name, c_type, dtype) row unchanged; the unpacking also
    # enforces that every row has exactly three fields.

    for name, c_type, dtype in dtypes:
        yield name, c_type, dtype
}}
|
||||
|
||||
{{for name, c_type, dtype in get_dispatch(dtypes)}}
|
||||
|
||||
|
||||
def ensure_{{name}}(object arr, copy=True):
    # Return `arr` as an ndarray of dtype np.{{dtype}}.
    #   - ndarray already of type NPY_{{c_type}}: returned as-is (same object)
    #   - any other ndarray: converted with astype, forwarding `copy`
    #   - non-ndarray input: built with np.array; `copy` is not consulted here
    if util.is_array(arr):
        if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
            return arr
        else:
            return arr.astype(np.{{dtype}}, copy=copy)
    else:
        return np.array(arr, dtype=np.{{dtype}})
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,222 @@
|
||||
"""
|
||||
Template for each `dtype` helper function for take
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# take_1d, take_2d
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
{{py:

# c_type_in, c_type_out
# One row per (input C type, output C type) pair; each row produces one
# function per take_* template in the loop that follows this block.
dtypes = [
    ('uint8_t', 'uint8_t'),
    ('uint8_t', 'object'),
    ('int8_t', 'int8_t'),
    ('int8_t', 'int32_t'),
    ('int8_t', 'int64_t'),
    ('int8_t', 'float64_t'),
    ('int16_t', 'int16_t'),
    ('int16_t', 'int32_t'),
    ('int16_t', 'int64_t'),
    ('int16_t', 'float64_t'),
    ('int32_t', 'int32_t'),
    ('int32_t', 'int64_t'),
    ('int32_t', 'float64_t'),
    ('int64_t', 'int64_t'),
    ('int64_t', 'float64_t'),
    ('float32_t', 'float32_t'),
    ('float32_t', 'float64_t'),
    ('float64_t', 'float64_t'),
    ('object', 'object'),
]


def get_dispatch(dtypes):
    # Yield (name, dest, c_type_in, c_type_out) for every dtype pair, where
    # name/dest are the suffixes used in the generated function names.

    def get_name(dtype_name):
        # Map a C type name to its function-name suffix:
        # "object" stays "object", "uint8_t" becomes "bool", and every
        # other name drops its trailing "_t".
        if dtype_name == "object":
            return "object"
        if dtype_name == "uint8_t":
            return "bool"
        return dtype_name[:-2]

    for (c_type_in, c_type_out) in dtypes:
        name = get_name(c_type_in)
        dest = get_name(c_type_out)

        yield (name, dest, c_type_in, c_type_out)

}}
|
||||
|
||||
|
||||
{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}}
|
||||
|
||||
|
||||
# take_1d: for each position i of `indexer`, write values[indexer[i]] into
# out[i], or `fill_value` when the index is -1. Nothing is returned; the
# result is written into `out`.
#   - object input uses a typed ndarray argument instead of a const memoryview
#   - the loop runs under `nogil` unless the output type is object
#   - uint8 -> object converts each element to a Python True/False
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
{{else}}
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
{{endif}}
                              const intp_t[:] indexer,
                              {{c_type_out}}[:] out,
                              fill_value=np.nan):

    cdef:
        Py_ssize_t i, n, idx
        {{c_type_out}} fv

    n = indexer.shape[0]

    # Coerce the fill value to the output C type once, outside the loop.
    fv = fill_value

    {{if c_type_out != "object"}}
    with nogil:
    {{else}}
    if True:
    {{endif}}
        for i in range(n):
            idx = indexer[i]
            if idx == -1:
                out[i] = fv
            else:
                {{if c_type_in == "uint8_t" and c_type_out == "object"}}
                out[i] = True if values[idx] > 0 else False
                {{else}}
                out[i] = values[idx]
                {{endif}}
|
||||
|
||||
|
||||
# take_2d_axis0: for each position i of `indexer`, copy row
# values[indexer[i], :] into out[i, :], or fill the row with `fill_value`
# when the index is -1. Nothing is returned; the result is written into `out`.
#   - when input and output share a non-object type, a fast path copies whole
#     rows with memmove, provided both arrays have an element-sized stride
#     along axis 1 and sizeof(type) * n >= 256
#   - uint8 -> object converts each element to a Python True/False
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
                                    ndarray[intp_t, ndim=1] indexer,
                                    {{c_type_out}}[:, :] out,
                                    fill_value=np.nan):
    cdef:
        Py_ssize_t i, j, k, n, idx
        {{c_type_out}} fv
        {{if c_type_in == c_type_out != "object"}}
        const {{c_type_out}} *v
        {{c_type_out}} *o
        {{endif}}

    n = len(indexer)
    k = values.shape[1]

    # Coerce the fill value to the output C type once, outside the loops.
    fv = fill_value

    {{if c_type_in == c_type_out != "object"}}
    # GH#3130
    if (values.strides[1] == out.strides[1] and
        values.strides[1] == sizeof({{c_type_out}}) and
        sizeof({{c_type_out}}) * n >= 256):

        for i in range(n):
            idx = indexer[i]
            if idx == -1:
                for j in range(k):
                    out[i, j] = fv
            else:
                v = &values[idx, 0]
                o = &out[i, 0]
                memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k))
        return
    {{endif}}

    for i in range(n):
        idx = indexer[i]
        if idx == -1:
            for j in range(k):
                out[i, j] = fv
        else:
            for j in range(k):
                {{if c_type_in == "uint8_t" and c_type_out == "object"}}
                out[i, j] = True if values[idx, j] > 0 else False
                {{else}}
                out[i, j] = values[idx, j]
                {{endif}}
|
||||
|
||||
|
||||
# take_2d_axis1: for each row i and each position j of `indexer`, write
# values[i, indexer[j]] into out[i, j], or `fill_value` when the index is -1.
# Nothing is returned; the result is written into `out`.
#   - returns immediately, leaving `out` untouched, when `values` has no rows
#     or `indexer` is empty
#   - uint8 -> object converts each element to a Python True/False
@cython.wraparound(False)
@cython.boundscheck(False)
{{if c_type_in != "object"}}
def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
                                    ndarray[intp_t, ndim=1] indexer,
                                    {{c_type_out}}[:, :] out,
                                    fill_value=np.nan):

    cdef:
        Py_ssize_t i, j, k, n, idx
        {{c_type_out}} fv

    n = len(values)
    k = len(indexer)

    if n == 0 or k == 0:
        return

    # Coerce the fill value to the output C type once, outside the loops.
    fv = fill_value

    for i in range(n):
        for j in range(k):
            idx = indexer[j]
            if idx == -1:
                out[i, j] = fv
            else:
                {{if c_type_in == "uint8_t" and c_type_out == "object"}}
                out[i, j] = True if values[i, idx] > 0 else False
                {{else}}
                out[i, j] = values[i, idx]
                {{endif}}
|
||||
|
||||
|
||||
# take_2d_multi: take along both axes at once. `indexer` is indexed as
# (row_indexer, column_indexer); out[i, j] receives
# values[row_indexer[i], column_indexer[j]], or `fill_value` when either
# index is -1. Nothing is returned; the result is written into `out`.
#   - uint8 -> object converts each element to a Python True/False
@cython.wraparound(False)
@cython.boundscheck(False)
def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
                                    indexer,
                                    ndarray[{{c_type_out}}, ndim=2] out,
                                    fill_value=np.nan):
    cdef:
        Py_ssize_t i, j, k, n, idx
        ndarray[intp_t, ndim=1] idx0 = indexer[0]
        ndarray[intp_t, ndim=1] idx1 = indexer[1]
        {{c_type_out}} fv

    n = len(idx0)
    k = len(idx1)

    # Coerce the fill value to the output C type once, outside the loops.
    fv = fill_value
    for i in range(n):
        idx = idx0[i]
        if idx == -1:
            # Missing row: the whole output row is filled.
            for j in range(k):
                out[i, j] = fv
        else:
            for j in range(k):
                if idx1[j] == -1:
                    out[i, j] = fv
                else:
                    {{if c_type_in == "uint8_t" and c_type_out == "object"}}
                    out[i, j] = True if values[idx, idx1[j]] > 0 else False
                    {{else}}
                    out[i, j] = values[idx, idx1[j]]
                    {{endif}}
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,11 @@
|
||||
|
||||
from numpy cimport ndarray
|
||||
|
||||
|
||||
# C-level declaration of NDArrayBacked: two attributes exposed read-only to
# Python, plus the two cpdef methods callable from both C and Python.
cdef class NDArrayBacked:
    cdef:
        readonly ndarray _ndarray
        readonly object _dtype

    cpdef NDArrayBacked _from_backing_data(self, ndarray values)
    cpdef __setstate__(self, state)
|
||||
@@ -0,0 +1,34 @@
|
||||
from typing import Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
DtypeObj,
|
||||
Shape,
|
||||
)
|
||||
|
||||
class NDArrayBacked:
    # Type stub for the compiled NDArrayBacked class.
    # The backing ndarray and the dtype object stored alongside it.
    _dtype: DtypeObj
    _ndarray: np.ndarray

    def __init__(self, values: np.ndarray, dtype: DtypeObj): ...
    @classmethod
    def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ...
    def _from_backing_data(self, values: np.ndarray): ...
    def __setstate__(self, state): ...
    def __len__(self) -> int: ...
    @property
    def shape(self) -> Shape: ...
    @property
    def ndim(self) -> int: ...
    @property
    def size(self) -> int: ...
    @property
    def nbytes(self) -> int: ...
    # `order` mirrors the implementation's signature, copy(self, order="C").
    def copy(self, order=...): ...
    def delete(self, loc, axis=...): ...
    def swapaxes(self, axis1, axis2): ...
    def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
    def reshape(self, *args, **kwargs): ...
    def ravel(self, order=...): ...
    @property
    def T(self): ...
|
||||
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Cython implementations for internal ExtensionArrays.
|
||||
"""
|
||||
cimport cython
|
||||
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from cpython cimport PyErr_Clear
|
||||
from numpy cimport ndarray
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
@cython.freelist(16)
|
||||
cdef class NDArrayBacked:
|
||||
"""
|
||||
Implementing these methods in cython improves performance quite a bit.
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas._libs.arrays import NDArrayBacked as cls
|
||||
|
||||
dti = pd.date_range("2016-01-01", periods=3)
|
||||
dta = dti._data
|
||||
arr = dta._ndarray
|
||||
|
||||
obj = cls._simple_new(arr, arr.dtype)
|
||||
|
||||
# for foo in [arr, dta, obj]: ...
|
||||
|
||||
%timeit foo.copy()
|
||||
299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
|
||||
530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
|
||||
1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
|
||||
328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
|
||||
371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simple_new
|
||||
|
||||
%timeit foo.T
|
||||
125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
|
||||
226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
|
||||
911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
|
||||
215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simple_new
|
||||
|
||||
"""
|
||||
# TODO: implement take in terms of cnp.PyArray_TakeFrom
|
||||
# TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate
|
||||
|
||||
# cdef:
|
||||
# readonly ndarray _ndarray
|
||||
# readonly object _dtype
|
||||
|
||||
def __init__(self, ndarray values, object dtype):
    # Store the backing array and its dtype as given; no validation or
    # copying is performed here.
    self._ndarray = values
    self._dtype = dtype
|
||||
|
||||
@classmethod
def _simple_new(cls, ndarray values, object dtype):
    # Alternate constructor: allocate an instance of `cls` through __new__
    # (so __init__ is not invoked) and set both attributes directly.
    cdef:
        NDArrayBacked obj
    obj = NDArrayBacked.__new__(cls)
    obj._ndarray = values
    obj._dtype = dtype
    return obj
|
||||
|
||||
cpdef NDArrayBacked _from_backing_data(self, ndarray values):
    """
    Construct a new ExtensionArray `new_array` with `values` as its _ndarray.

    The new object has the same type as `self` and reuses `self._dtype`;
    `values` is stored as-is, without copying.

    This should round-trip:
        self == self._from_backing_data(self._ndarray)
    """
    # TODO: re-use simple_new if/when it can be cpdef
    cdef:
        NDArrayBacked obj
    # Allocate through __new__ so that __init__ is not invoked.
    obj = NDArrayBacked.__new__(type(self))
    obj._ndarray = values
    obj._dtype = self._dtype
    return obj
|
||||
|
||||
cpdef __setstate__(self, state):
    # Restore the object from pickled state. Accepted layouts:
    #   - dict holding "_data" or "_ndarray" plus "_dtype"; remaining items
    #     are applied with setattr. The consumed keys are popped, so the
    #     passed-in dict is modified.
    #   - 1-tuple wrapping such a dict (delegates to the dict branch)
    #   - 3-tuple whose first two items are the array and dtype in either
    #     order, and whose third item is a dict of extra attributes
    # Anything else raises NotImplementedError; a dict with neither array
    # key raises ValueError.
    if isinstance(state, dict):
        if "_data" in state:
            data = state.pop("_data")
        elif "_ndarray" in state:
            data = state.pop("_ndarray")
        else:
            raise ValueError  # pragma: no cover
        self._ndarray = data
        self._dtype = state.pop("_dtype")

        for key, val in state.items():
            setattr(self, key, val)
    elif isinstance(state, tuple):
        if len(state) != 3:
            if len(state) == 1 and isinstance(state[0], dict):
                self.__setstate__(state[0])
                return
            raise NotImplementedError(state)  # pragma: no cover

        data, dtype = state[:2]
        if isinstance(dtype, np.ndarray):
            # The two leading items arrived as (dtype, array); swap them.
            dtype, data = data, dtype
        self._ndarray = data
        self._dtype = dtype

        if isinstance(state[2], dict):
            for key, val in state[2].items():
                setattr(self, key, val)
        else:
            raise NotImplementedError(state)  # pragma: no cover
    else:
        raise NotImplementedError(state)  # pragma: no cover
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._ndarray)
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
# object cast bc _ndarray.shape is npy_intp*
|
||||
return (<object>(self._ndarray)).shape
|
||||
|
||||
@property
|
||||
def ndim(self) -> int:
|
||||
return self._ndarray.ndim
|
||||
|
||||
@property
|
||||
def size(self) -> int:
|
||||
return self._ndarray.size
|
||||
|
||||
@property
|
||||
def nbytes(self) -> int:
|
||||
return self._ndarray.nbytes
|
||||
|
||||
def copy(self, order="C"):
|
||||
cdef:
|
||||
cnp.NPY_ORDER order_code
|
||||
int success
|
||||
|
||||
success = cnp.PyArray_OrderConverter(order, &order_code)
|
||||
if not success:
|
||||
# clear exception so that we don't get a SystemError
|
||||
PyErr_Clear()
|
||||
# same message used by numpy
|
||||
msg = f"order must be one of 'C', 'F', 'A', or 'K' (got '{order}')"
|
||||
raise ValueError(msg)
|
||||
|
||||
res_values = cnp.PyArray_NewCopy(self._ndarray, order_code)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
def delete(self, loc, axis=0):
|
||||
res_values = np.delete(self._ndarray, loc, axis=axis)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
def swapaxes(self, axis1, axis2):
|
||||
res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
# TODO: pass NPY_MAXDIMS equiv to axis=None?
|
||||
def repeat(self, repeats, axis: int = 0):
|
||||
if axis is None:
|
||||
axis = 0
|
||||
res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
def reshape(self, *args, **kwargs):
|
||||
res_values = self._ndarray.reshape(*args, **kwargs)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
def ravel(self, order="C"):
|
||||
# cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
|
||||
# res_values = cnp.PyArray_Ravel(self._ndarray, order)
|
||||
res_values = self._ndarray.ravel(order)
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
@property
|
||||
def T(self):
|
||||
res_values = self._ndarray.T
|
||||
return self._from_backing_data(res_values)
|
||||
|
||||
def transpose(self, *axes):
|
||||
res_values = self._ndarray.transpose(*axes)
|
||||
return self._from_backing_data(res_values)
|
||||
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Common location for shared fused types
|
||||
"""
|
||||
|
||||
from numpy cimport (
|
||||
float32_t,
|
||||
float64_t,
|
||||
int8_t,
|
||||
int16_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
uint8_t,
|
||||
uint16_t,
|
||||
uint32_t,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
# All numeric types except complex
|
||||
ctypedef fused numeric_t:
|
||||
int8_t
|
||||
int16_t
|
||||
int32_t
|
||||
int64_t
|
||||
|
||||
uint8_t
|
||||
uint16_t
|
||||
uint32_t
|
||||
uint64_t
|
||||
|
||||
float32_t
|
||||
float64_t
|
||||
|
||||
# All numeric types + object, doesn't include complex
|
||||
ctypedef fused numeric_object_t:
|
||||
numeric_t
|
||||
object
|
||||
|
||||
# i64 + u64 + all float types
|
||||
ctypedef fused iu_64_floating_t:
|
||||
float64_t
|
||||
float32_t
|
||||
int64_t
|
||||
uint64_t
|
||||
|
||||
# i64 + u64 + all float types + object
|
||||
ctypedef fused iu_64_floating_obj_t:
|
||||
iu_64_floating_t
|
||||
object
|
||||
Binary file not shown.
@@ -0,0 +1,159 @@
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
def group_median_float64(
|
||||
out: np.ndarray, # ndarray[float64_t, ndim=2]
|
||||
counts: npt.NDArray[np.int64],
|
||||
values: np.ndarray, # ndarray[float64_t, ndim=2]
|
||||
labels: npt.NDArray[np.int64],
|
||||
min_count: int = ..., # Py_ssize_t
|
||||
) -> None: ...
|
||||
def group_cumprod_float64(
|
||||
out: np.ndarray, # float64_t[:, ::1]
|
||||
values: np.ndarray, # const float64_t[:, :]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
ngroups: int,
|
||||
is_datetimelike: bool,
|
||||
skipna: bool = ...,
|
||||
) -> None: ...
|
||||
def group_cumsum(
|
||||
out: np.ndarray, # numeric[:, ::1]
|
||||
values: np.ndarray, # ndarray[numeric, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
ngroups: int,
|
||||
is_datetimelike: bool,
|
||||
skipna: bool = ...,
|
||||
) -> None: ...
|
||||
def group_shift_indexer(
|
||||
out: np.ndarray, # int64_t[::1]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
ngroups: int,
|
||||
periods: int,
|
||||
) -> None: ...
|
||||
def group_fillna_indexer(
|
||||
out: np.ndarray, # ndarray[intp_t]
|
||||
labels: np.ndarray, # ndarray[int64_t]
|
||||
sorted_labels: npt.NDArray[np.intp],
|
||||
mask: npt.NDArray[np.uint8],
|
||||
direction: Literal["ffill", "bfill"],
|
||||
limit: int, # int64_t
|
||||
dropna: bool,
|
||||
) -> None: ...
|
||||
def group_any_all(
|
||||
out: np.ndarray, # uint8_t[::1]
|
||||
values: np.ndarray, # const uint8_t[::1]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
mask: np.ndarray, # const uint8_t[::1]
|
||||
val_test: Literal["any", "all"],
|
||||
skipna: bool,
|
||||
) -> None: ...
|
||||
def group_add(
|
||||
out: np.ndarray, # complexfloating_t[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[complexfloating_t, ndim=2]
|
||||
labels: np.ndarray, # const intp_t[:]
|
||||
min_count: int = ...,
|
||||
datetimelike: bool = ...,
|
||||
) -> None: ...
|
||||
def group_prod(
|
||||
out: np.ndarray, # floating[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[floating, ndim=2]
|
||||
labels: np.ndarray, # const intp_t[:]
|
||||
min_count: int = ...,
|
||||
) -> None: ...
|
||||
def group_var(
|
||||
out: np.ndarray, # floating[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[floating, ndim=2]
|
||||
labels: np.ndarray, # const intp_t[:]
|
||||
min_count: int = ..., # Py_ssize_t
|
||||
ddof: int = ..., # int64_t
|
||||
) -> None: ...
|
||||
def group_mean(
|
||||
out: np.ndarray, # floating[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[floating, ndim=2]
|
||||
labels: np.ndarray, # const intp_t[:]
|
||||
min_count: int = ..., # Py_ssize_t
|
||||
is_datetimelike: bool = ..., # bint
|
||||
mask: np.ndarray | None = ...,
|
||||
result_mask: np.ndarray | None = ...,
|
||||
) -> None: ...
|
||||
def group_ohlc(
|
||||
out: np.ndarray, # floating[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[floating, ndim=2]
|
||||
labels: np.ndarray, # const intp_t[:]
|
||||
min_count: int = ...,
|
||||
) -> None: ...
|
||||
def group_quantile(
|
||||
out: npt.NDArray[np.float64],
|
||||
values: np.ndarray, # ndarray[numeric, ndim=1]
|
||||
labels: npt.NDArray[np.intp],
|
||||
mask: npt.NDArray[np.uint8],
|
||||
sort_indexer: npt.NDArray[np.intp], # const
|
||||
qs: npt.NDArray[np.float64], # const
|
||||
interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
|
||||
) -> None: ...
|
||||
def group_last(
|
||||
out: np.ndarray, # rank_t[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[rank_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
min_count: int = ..., # Py_ssize_t
|
||||
) -> None: ...
|
||||
def group_nth(
|
||||
out: np.ndarray, # rank_t[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[rank_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
min_count: int = ..., # int64_t
|
||||
rank: int = ..., # int64_t
|
||||
) -> None: ...
|
||||
def group_rank(
    out: np.ndarray,  # float64_t[:, ::1]
    values: np.ndarray,  # ndarray[rank_t, ndim=2]
    labels: np.ndarray,  # const int64_t[:]
    ngroups: int,
    is_datetimelike: bool,
    # tie-breaking rule; spelled "average" (previously mistyped as "aveage")
    ties_method: Literal["average", "min", "max", "first", "dense"] = ...,
    ascending: bool = ...,
    pct: bool = ...,
    na_option: Literal["keep", "top", "bottom"] = ...,
) -> None: ...
|
||||
def group_max(
|
||||
out: np.ndarray, # groupby_t[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[groupby_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
min_count: int = ...,
|
||||
mask: np.ndarray | None = ...,
|
||||
result_mask: np.ndarray | None = ...,
|
||||
) -> None: ...
|
||||
def group_min(
|
||||
out: np.ndarray, # groupby_t[:, ::1]
|
||||
counts: np.ndarray, # int64_t[::1]
|
||||
values: np.ndarray, # ndarray[groupby_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
min_count: int = ...,
|
||||
mask: np.ndarray | None = ...,
|
||||
result_mask: np.ndarray | None = ...,
|
||||
) -> None: ...
|
||||
def group_cummin(
|
||||
out: np.ndarray, # groupby_t[:, ::1]
|
||||
values: np.ndarray, # ndarray[groupby_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
ngroups: int,
|
||||
is_datetimelike: bool,
|
||||
) -> None: ...
|
||||
def group_cummax(
|
||||
out: np.ndarray, # groupby_t[:, ::1]
|
||||
values: np.ndarray, # ndarray[groupby_t, ndim=2]
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
ngroups: int,
|
||||
is_datetimelike: bool,
|
||||
) -> None: ...
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
def hash_object_array(
|
||||
arr: npt.NDArray[np.object_],
|
||||
key: str,
|
||||
encoding: str = ...,
|
||||
) -> npt.NDArray[np.uint64]: ...
|
||||
@@ -0,0 +1,198 @@
|
||||
# Translated from the reference implementation
|
||||
# at https://github.com/veorq/SipHash
|
||||
|
||||
import cython
|
||||
|
||||
from libc.stdlib cimport (
|
||||
free,
|
||||
malloc,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numpy cimport (
|
||||
import_array,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
uint32_t,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
import_array()
|
||||
|
||||
from pandas._libs.util cimport is_nan
|
||||
|
||||
DEF cROUNDS = 2
|
||||
DEF dROUNDS = 4
|
||||
|
||||
|
||||
@cython.boundscheck(False)
def hash_object_array(
    ndarray[object] arr, str key, str encoding="utf8"
) -> np.ndarray[np.uint64]:
    """
    Hash every element of an object array with SipHash.

    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must be 16 byte len encoded
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes.

    Raises
    ------
    ValueError
        If the encoded key is not exactly 16 bytes long.
    TypeError
        If the array contains mixed types.
    MemoryError
        If the temporary C buffers cannot be allocated.

    Notes
    -----
    Allowed values must be strings, or nulls
    mixed array types will raise TypeError.
    """
    cdef:
        Py_ssize_t i, n
        uint64_t[:] result
        bytes data, k
        uint8_t *kb
        # start as NULL so the ``finally`` clause can free() unconditionally
        uint64_t *lens = NULL
        char **vecs = NULL
        char *cdata
        object val
        list datas = []

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
        )

    n = len(arr)

    try:
        # create an array of bytes
        vecs = <char **>malloc(n * sizeof(char *))
        lens = <uint64_t*>malloc(n * sizeof(uint64_t))
        # malloc(0) may legitimately return NULL, so only treat NULL as
        # an allocation failure when something was actually requested
        if n > 0 and (vecs is NULL or lens is NULL):
            raise MemoryError()

        for i in range(n):
            val = arr[i]
            if isinstance(val, bytes):
                data = <bytes>val
            elif isinstance(val, str):
                data = <bytes>val.encode(encoding)
            elif val is None or is_nan(val):
                # null, stringify and encode
                data = <bytes>str(val).encode(encoding)

            elif isinstance(val, tuple):
                # GH#28969 we could have a tuple, but need to ensure that
                #  the tuple entries are themselves hashable before converting
                #  to str
                hash(val)
                data = <bytes>str(val).encode(encoding)
            else:
                raise TypeError(
                    f"{val} of type {type(val)} is not a valid type for hashing, "
                    "must be string or null"
                )

            lens[i] = len(data)
            cdata = data

            # keep the references alive through the end of the
            # function
            datas.append(data)
            vecs[i] = cdata

        result = np.empty(n, dtype=np.uint64)
        with nogil:
            for i in range(n):
                result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
    finally:
        # runs on success and on every error path above, so the buffers
        # are never leaked; free(NULL) is a no-op
        free(vecs)
        free(lens)

    return result.base  # .base to retrieve underlying np.ndarray
|
||||
|
||||
|
||||
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
|
||||
return (x << b) | (x >> (64 - b))
|
||||
|
||||
|
||||
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
|
||||
return (<uint64_t>p[0] |
|
||||
<uint64_t>p[1] << 8 |
|
||||
<uint64_t>p[2] << 16 |
|
||||
<uint64_t>p[3] << 24 |
|
||||
<uint64_t>p[4] << 32 |
|
||||
<uint64_t>p[5] << 40 |
|
||||
<uint64_t>p[6] << 48 |
|
||||
<uint64_t>p[7] << 56)
|
||||
|
||||
|
||||
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
|
||||
uint64_t* v2, uint64_t* v3) nogil:
|
||||
v0[0] += v1[0]
|
||||
v1[0] = _rotl(v1[0], 13)
|
||||
v1[0] ^= v0[0]
|
||||
v0[0] = _rotl(v0[0], 32)
|
||||
v2[0] += v3[0]
|
||||
v3[0] = _rotl(v3[0], 16)
|
||||
v3[0] ^= v2[0]
|
||||
v0[0] += v3[0]
|
||||
v3[0] = _rotl(v3[0], 21)
|
||||
v3[0] ^= v0[0]
|
||||
v2[0] += v1[0]
|
||||
v1[0] = _rotl(v1[0], 17)
|
||||
v1[0] ^= v2[0]
|
||||
v2[0] = _rotl(v2[0], 32)
|
||||
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
|
||||
uint8_t* key) nogil:
|
||||
cdef uint64_t v0 = 0x736f6d6570736575ULL
|
||||
cdef uint64_t v1 = 0x646f72616e646f6dULL
|
||||
cdef uint64_t v2 = 0x6c7967656e657261ULL
|
||||
cdef uint64_t v3 = 0x7465646279746573ULL
|
||||
cdef uint64_t b
|
||||
cdef uint64_t k0 = u8to64_le(key)
|
||||
cdef uint64_t k1 = u8to64_le(key + 8)
|
||||
cdef uint64_t m
|
||||
cdef int i
|
||||
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
|
||||
cdef int left = datalen & 7
|
||||
cdef int left_byte
|
||||
|
||||
b = (<uint64_t>datalen) << 56
|
||||
v3 ^= k1
|
||||
v2 ^= k0
|
||||
v1 ^= k1
|
||||
v0 ^= k0
|
||||
|
||||
while (data != end):
|
||||
m = u8to64_le(data)
|
||||
v3 ^= m
|
||||
for i in range(cROUNDS):
|
||||
_sipround(&v0, &v1, &v2, &v3)
|
||||
v0 ^= m
|
||||
|
||||
data += sizeof(uint64_t)
|
||||
|
||||
for i in range(left-1, -1, -1):
|
||||
b |= (<uint64_t>data[i]) << (i * 8)
|
||||
|
||||
v3 ^= b
|
||||
|
||||
for i in range(cROUNDS):
|
||||
_sipround(&v0, &v1, &v2, &v3)
|
||||
|
||||
v0 ^= b
|
||||
v2 ^= 0xff
|
||||
|
||||
for i in range(dROUNDS):
|
||||
_sipround(&v0, &v1, &v2, &v3)
|
||||
|
||||
b = v0 ^ v1 ^ v2 ^ v3
|
||||
|
||||
return b
|
||||
Binary file not shown.
@@ -0,0 +1,141 @@
|
||||
from numpy cimport (
|
||||
intp_t,
|
||||
ndarray,
|
||||
)
|
||||
|
||||
from pandas._libs.khash cimport (
|
||||
complex64_t,
|
||||
complex128_t,
|
||||
float32_t,
|
||||
float64_t,
|
||||
int8_t,
|
||||
int16_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
kh_complex64_t,
|
||||
kh_complex128_t,
|
||||
kh_float32_t,
|
||||
kh_float64_t,
|
||||
kh_int8_t,
|
||||
kh_int16_t,
|
||||
kh_int32_t,
|
||||
kh_int64_t,
|
||||
kh_pymap_t,
|
||||
kh_str_t,
|
||||
kh_uint8_t,
|
||||
kh_uint16_t,
|
||||
kh_uint32_t,
|
||||
kh_uint64_t,
|
||||
khcomplex64_t,
|
||||
khcomplex128_t,
|
||||
uint8_t,
|
||||
uint16_t,
|
||||
uint32_t,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
# prototypes for sharing
|
||||
|
||||
cdef class HashTable:
|
||||
pass
|
||||
|
||||
cdef class UInt64HashTable(HashTable):
|
||||
cdef kh_uint64_t *table
|
||||
|
||||
cpdef get_item(self, uint64_t val)
|
||||
cpdef set_item(self, uint64_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Int64HashTable(HashTable):
|
||||
cdef kh_int64_t *table
|
||||
|
||||
cpdef get_item(self, int64_t val)
|
||||
cpdef set_item(self, int64_t key, Py_ssize_t val)
|
||||
|
||||
cdef class UInt32HashTable(HashTable):
|
||||
cdef kh_uint32_t *table
|
||||
|
||||
cpdef get_item(self, uint32_t val)
|
||||
cpdef set_item(self, uint32_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Int32HashTable(HashTable):
|
||||
cdef kh_int32_t *table
|
||||
|
||||
cpdef get_item(self, int32_t val)
|
||||
cpdef set_item(self, int32_t key, Py_ssize_t val)
|
||||
|
||||
cdef class UInt16HashTable(HashTable):
|
||||
cdef kh_uint16_t *table
|
||||
|
||||
cpdef get_item(self, uint16_t val)
|
||||
cpdef set_item(self, uint16_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Int16HashTable(HashTable):
|
||||
cdef kh_int16_t *table
|
||||
|
||||
cpdef get_item(self, int16_t val)
|
||||
cpdef set_item(self, int16_t key, Py_ssize_t val)
|
||||
|
||||
cdef class UInt8HashTable(HashTable):
|
||||
cdef kh_uint8_t *table
|
||||
|
||||
cpdef get_item(self, uint8_t val)
|
||||
cpdef set_item(self, uint8_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Int8HashTable(HashTable):
|
||||
cdef kh_int8_t *table
|
||||
|
||||
cpdef get_item(self, int8_t val)
|
||||
cpdef set_item(self, int8_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Float64HashTable(HashTable):
|
||||
cdef kh_float64_t *table
|
||||
|
||||
cpdef get_item(self, float64_t val)
|
||||
cpdef set_item(self, float64_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Float32HashTable(HashTable):
|
||||
cdef kh_float32_t *table
|
||||
|
||||
cpdef get_item(self, float32_t val)
|
||||
cpdef set_item(self, float32_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Complex64HashTable(HashTable):
|
||||
cdef kh_complex64_t *table
|
||||
|
||||
cpdef get_item(self, complex64_t val)
|
||||
cpdef set_item(self, complex64_t key, Py_ssize_t val)
|
||||
|
||||
cdef class Complex128HashTable(HashTable):
|
||||
cdef kh_complex128_t *table
|
||||
|
||||
cpdef get_item(self, complex128_t val)
|
||||
cpdef set_item(self, complex128_t key, Py_ssize_t val)
|
||||
|
||||
cdef class PyObjectHashTable(HashTable):
|
||||
cdef kh_pymap_t *table
|
||||
|
||||
cpdef get_item(self, object val)
|
||||
cpdef set_item(self, object key, Py_ssize_t val)
|
||||
|
||||
|
||||
cdef class StringHashTable(HashTable):
|
||||
cdef kh_str_t *table
|
||||
|
||||
cpdef get_item(self, str val)
|
||||
cpdef set_item(self, str key, Py_ssize_t val)
|
||||
|
||||
cdef struct Int64VectorData:
|
||||
int64_t *data
|
||||
Py_ssize_t n, m
|
||||
|
||||
cdef class Vector:
|
||||
cdef bint external_view_exists
|
||||
|
||||
cdef class Int64Vector(Vector):
|
||||
cdef Int64VectorData *data
|
||||
cdef ndarray ao
|
||||
|
||||
cdef resize(self)
|
||||
cpdef ndarray to_array(self)
|
||||
cdef inline void append(self, int64_t x)
|
||||
cdef extend(self, int64_t[:] x)
|
||||
@@ -0,0 +1,213 @@
|
||||
from typing import (
|
||||
Hashable,
|
||||
Literal,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
def unique_label_indices(
|
||||
labels: np.ndarray, # const int64_t[:]
|
||||
) -> np.ndarray: ...
|
||||
|
||||
class Factorizer:
|
||||
count: int
|
||||
def __init__(self, size_hint: int): ...
|
||||
def get_count(self) -> int: ...
|
||||
|
||||
class ObjectFactorizer(Factorizer):
|
||||
table: PyObjectHashTable
|
||||
uniques: ObjectVector
|
||||
def factorize(
|
||||
self,
|
||||
values: npt.NDArray[np.object_],
|
||||
sort: bool = ...,
|
||||
na_sentinel=...,
|
||||
na_value=...,
|
||||
) -> npt.NDArray[np.intp]: ...
|
||||
|
||||
class Int64Factorizer(Factorizer):
|
||||
table: Int64HashTable
|
||||
uniques: Int64Vector
|
||||
def factorize(
|
||||
self,
|
||||
values: np.ndarray, # const int64_t[:]
|
||||
sort: bool = ...,
|
||||
na_sentinel=...,
|
||||
na_value=...,
|
||||
) -> npt.NDArray[np.intp]: ...
|
||||
|
||||
class Int64Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.int64]: ...
|
||||
|
||||
class Int32Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.int32]: ...
|
||||
|
||||
class Int16Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.int16]: ...
|
||||
|
||||
class Int8Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.int8]: ...
|
||||
|
||||
class UInt64Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.uint64]: ...
|
||||
|
||||
class UInt32Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.uint32]: ...
|
||||
|
||||
class UInt16Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.uint16]: ...
|
||||
|
||||
class UInt8Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.uint8]: ...
|
||||
|
||||
class Float64Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.float64]: ...
|
||||
|
||||
class Float32Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.float32]: ...
|
||||
|
||||
class Complex128Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.complex128]: ...
|
||||
|
||||
class Complex64Vector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.complex64]: ...
|
||||
|
||||
class StringVector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.object_]: ...
|
||||
|
||||
class ObjectVector:
|
||||
def __init__(self): ...
|
||||
def __len__(self) -> int: ...
|
||||
def to_array(self) -> npt.NDArray[np.object_]: ...
|
||||
|
||||
class HashTable:
    # NB:  The base HashTable class does _not_ actually have these methods;
    #  we are putting them here for the sake of mypy to avoid
    #  reproducing them in each subclass below.
    def __init__(self, size_hint: int = ...): ...
    def __len__(self) -> int: ...
    def __contains__(self, key: Hashable) -> bool: ...
    def sizeof(self, deep: bool = ...) -> int: ...
    def get_state(self) -> dict[str, int]: ...
    # TODO: `item` type is subclass-specific
    def get_item(self, item): ...  # TODO: return type?
    # The compiled tables declare set_item(key, Py_ssize_t val); the stub must
    #  expose the value argument as well.
    def set_item(self, item, val: int) -> None: ...
    # FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
    def map(
        self,
        keys: np.ndarray,  # np.ndarray[subclass-specific]
        values: np.ndarray,  # const int64_t[:]
    ) -> None: ...
    def map_locations(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
    ) -> None: ...
    def lookup(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
    ) -> npt.NDArray[np.intp]: ...
    def get_labels(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
        uniques,  # SubclassTypeVector
        count_prior: int = ...,
        na_sentinel: int = ...,
        na_value: object = ...,
    ) -> npt.NDArray[np.intp]: ...
    def unique(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
        return_inverse: bool = ...,
    ) -> tuple[
        np.ndarray,  # np.ndarray[subclass-specific]
        npt.NDArray[np.intp],
    ] | np.ndarray: ...  # np.ndarray[subclass-specific]
    def _unique(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
        uniques,  # FooVector
        count_prior: int = ...,
        na_sentinel: int = ...,
        na_value: object = ...,
        ignore_na: bool = ...,
        return_inverse: bool = ...,
    ) -> tuple[
        np.ndarray,  # np.ndarray[subclass-specific]
        npt.NDArray[np.intp],
    ] | np.ndarray: ...  # np.ndarray[subclass-specific]
    def factorize(
        self,
        values: np.ndarray,  # np.ndarray[subclass-specific]
        na_sentinel: int = ...,
        na_value: object = ...,
        mask=...,
    ) -> tuple[np.ndarray, npt.NDArray[np.intp],]: ...  # np.ndarray[subclass-specific]
|
||||
|
||||
class Complex128HashTable(HashTable): ...
|
||||
class Complex64HashTable(HashTable): ...
|
||||
class Float64HashTable(HashTable): ...
|
||||
class Float32HashTable(HashTable): ...
|
||||
|
||||
class Int64HashTable(HashTable):
|
||||
# Only Int64HashTable has get_labels_groupby
|
||||
def get_labels_groupby(
|
||||
self,
|
||||
values: np.ndarray, # const int64_t[:]
|
||||
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ...
|
||||
|
||||
class Int32HashTable(HashTable): ...
|
||||
class Int16HashTable(HashTable): ...
|
||||
class Int8HashTable(HashTable): ...
|
||||
class UInt64HashTable(HashTable): ...
|
||||
class UInt32HashTable(HashTable): ...
|
||||
class UInt16HashTable(HashTable): ...
|
||||
class UInt8HashTable(HashTable): ...
|
||||
class StringHashTable(HashTable): ...
|
||||
class PyObjectHashTable(HashTable): ...
|
||||
class IntpHashTable(HashTable): ...
|
||||
|
||||
def duplicated(
|
||||
values: np.ndarray,
|
||||
keep: Literal["last", "first", False] = ...,
|
||||
) -> npt.NDArray[np.bool_]: ...
|
||||
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
|
||||
def value_count(
|
||||
values: np.ndarray,
|
||||
dropna: bool,
|
||||
) -> tuple[np.ndarray, npt.NDArray[np.int64],]: ... # np.ndarray[same-as-values]
|
||||
|
||||
# arr and values should have same dtype
|
||||
def ismember(
|
||||
arr: np.ndarray,
|
||||
values: np.ndarray,
|
||||
) -> npt.NDArray[np.bool_]: ...
|
||||
def object_hash(obj) -> int: ...
|
||||
def objects_are_equal(a, b) -> bool: ...
|
||||
@@ -0,0 +1,182 @@
|
||||
cimport cython
|
||||
from cpython.mem cimport (
|
||||
PyMem_Free,
|
||||
PyMem_Malloc,
|
||||
)
|
||||
from cpython.ref cimport (
|
||||
Py_INCREF,
|
||||
PyObject,
|
||||
)
|
||||
from libc.stdlib cimport (
|
||||
free,
|
||||
malloc,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
float64_t,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
uint32_t,
|
||||
)
|
||||
from numpy.math cimport NAN
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
from pandas._libs cimport util
|
||||
from pandas._libs.khash cimport (
|
||||
KHASH_TRACE_DOMAIN,
|
||||
are_equivalent_float32_t,
|
||||
are_equivalent_float64_t,
|
||||
are_equivalent_khcomplex64_t,
|
||||
are_equivalent_khcomplex128_t,
|
||||
kh_needed_n_buckets,
|
||||
kh_python_hash_equal,
|
||||
kh_python_hash_func,
|
||||
kh_str_t,
|
||||
khcomplex64_t,
|
||||
khcomplex128_t,
|
||||
khiter_t,
|
||||
)
|
||||
from pandas._libs.missing cimport checknull
|
||||
|
||||
|
||||
def get_hashtable_trace_domain():
|
||||
return KHASH_TRACE_DOMAIN
|
||||
|
||||
|
||||
def object_hash(obj):
|
||||
return kh_python_hash_func(obj)
|
||||
|
||||
|
||||
def objects_are_equal(a, b):
|
||||
return kh_python_hash_equal(a, b)
|
||||
|
||||
|
||||
cdef int64_t NPY_NAT = util.get_nat()
|
||||
SIZE_HINT_LIMIT = (1 << 20) + 7
|
||||
|
||||
|
||||
cdef Py_ssize_t _INIT_VEC_CAP = 128
|
||||
|
||||
include "hashtable_class_helper.pxi"
|
||||
include "hashtable_func_helper.pxi"
|
||||
|
||||
|
||||
# map derived hash-map types onto basic hash-map types:
|
||||
if np.dtype(np.intp) == np.dtype(np.int64):
|
||||
IntpHashTable = Int64HashTable
|
||||
unique_label_indices = _unique_label_indices_int64
|
||||
elif np.dtype(np.intp) == np.dtype(np.int32):
|
||||
IntpHashTable = Int32HashTable
|
||||
unique_label_indices = _unique_label_indices_int32
|
||||
else:
|
||||
raise ValueError(np.dtype(np.intp))
|
||||
|
||||
|
||||
cdef class Factorizer:
|
||||
cdef readonly:
|
||||
Py_ssize_t count
|
||||
|
||||
def __cinit__(self, size_hint: int):
|
||||
self.count = 0
|
||||
|
||||
def get_count(self) -> int:
|
||||
return self.count
|
||||
|
||||
|
||||
cdef class ObjectFactorizer(Factorizer):
|
||||
cdef public:
|
||||
PyObjectHashTable table
|
||||
ObjectVector uniques
|
||||
|
||||
def __cinit__(self, size_hint: int):
|
||||
self.table = PyObjectHashTable(size_hint)
|
||||
self.uniques = ObjectVector()
|
||||
|
||||
def factorize(
|
||||
self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
np.ndarray[np.intp]
|
||||
|
||||
Examples
|
||||
--------
|
||||
Factorize values with nans replaced by na_sentinel
|
||||
|
||||
>>> fac = ObjectFactorizer(3)
|
||||
>>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
|
||||
array([ 0, 1, 20])
|
||||
"""
|
||||
cdef:
|
||||
ndarray[intp_t] labels
|
||||
|
||||
if self.uniques.external_view_exists:
|
||||
uniques = ObjectVector()
|
||||
uniques.extend(self.uniques.to_array())
|
||||
self.uniques = uniques
|
||||
labels = self.table.get_labels(values, self.uniques,
|
||||
self.count, na_sentinel, na_value)
|
||||
mask = (labels == na_sentinel)
|
||||
# sort on
|
||||
if sort:
|
||||
sorter = self.uniques.to_array().argsort()
|
||||
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
|
||||
reverse_indexer.put(sorter, np.arange(len(sorter)))
|
||||
labels = reverse_indexer.take(labels, mode='clip')
|
||||
labels[mask] = na_sentinel
|
||||
self.count = len(self.uniques)
|
||||
return labels
|
||||
|
||||
|
||||
cdef class Int64Factorizer(Factorizer):
    # table   : hash table the labels are obtained from via get_labels
    # uniques : vector handed to get_labels to collect the unique values
    cdef public:
        Int64HashTable table
        Int64Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = Int64HashTable(size_hint)
        self.uniques = Int64Vector()

    def factorize(self, const int64_t[:] values, sort=False,
                  na_sentinel=-1, na_value=None) -> np.ndarray:
        """
        Compute integer labels for ``values``, extending ``self.uniques``.

        Parameters
        ----------
        values : const int64_t[:]
        sort : bool, default False
            If True, labels are remapped to the sorted order of the uniques.
        na_sentinel : int, default -1
            Forwarded to ``get_labels``; labels equal to it are left
            untouched by the ``sort`` remapping.
        na_value : object, default None
            Forwarded to ``get_labels``.

        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = Int64Factorizer(3)
        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # an array view of the current vector exists elsewhere, so
            # continue on a fresh vector holding the same values
            uniques = Int64Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value)

        # sort on
        if sort:
            # Same handling as ObjectFactorizer.factorize: remember where the
            #  sentinel sits, clip it during the remap so it can neither wrap
            #  around (-1) nor index out of bounds, then restore it.
            mask = (labels == na_sentinel)
            sorter = self.uniques.to_array().argsort()
            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
            reverse_indexer.put(sorter, np.arange(len(sorter)))

            labels = reverse_indexer.take(labels, mode='clip')
            labels[mask] = na_sentinel

        self.count = len(self.uniques)
        return labels
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
Template for each `dtype` helper function for hashtable
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
{{py:
|
||||
|
||||
# name, dtype, ttype, c_type, to_c_type
|
||||
dtypes = [('Complex128', 'complex128', 'complex128',
|
||||
'khcomplex128_t', 'to_khcomplex128_t'),
|
||||
('Complex64', 'complex64', 'complex64',
|
||||
'khcomplex64_t', 'to_khcomplex64_t'),
|
||||
('Float64', 'float64', 'float64', 'float64_t', ''),
|
||||
('Float32', 'float32', 'float32', 'float32_t', ''),
|
||||
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
|
||||
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
|
||||
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
|
||||
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
|
||||
('Object', 'object', 'pymap', 'object', ''),
|
||||
('Int64', 'int64', 'int64', 'int64_t', ''),
|
||||
('Int32', 'int32', 'int32', 'int32_t', ''),
|
||||
('Int16', 'int16', 'int16', 'int16_t', ''),
|
||||
('Int8', 'int8', 'int8', 'int8_t', '')]
|
||||
|
||||
}}
|
||||
|
||||
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
    """
    Count how often each distinct entry of ``values`` occurs.

    Parameters
    ----------
    values : {{dtype}} array
    dropna : bool
        When True, entries recognised as missing are left out of the tally.

    Returns
    -------
    keys
        The distinct entries, in order of first appearance (GH39009).
    counts : ndarray[int64]
        Number of occurrences, aligned with ``keys``.
    """
    cdef:
        Py_ssize_t pos = 0
        Py_ssize_t n_values = len(values)
        kh_{{ttype}}_t *counter

        # khiter_t rather than Py_ssize_t, since n_buckets is unsigned
        khiter_t slot

        {{c_type}} item

        int absent = 0
        int64_t[:] tallies

    # The khash map is not insertion-ordered, so the work is split:
    #    counter   maps key -> number of occurrences
    #    seen_keys records keys in the order they first show up
    seen_keys = {{name}}Vector()
    counter = kh_init_{{ttype}}()

    {{if dtype == 'object'}}
    kh_resize_{{ttype}}(counter, n_values // 10)

    for pos in range(n_values):
        item = values[pos]
        if dropna and checknull(item):
            continue

        slot = kh_get_{{ttype}}(counter, <PyObject*>item)
        if slot == counter.n_buckets:
            # first sighting of this key
            slot = kh_put_{{ttype}}(counter, <PyObject*>item, &absent)
            counter.vals[slot] = 1
            seen_keys.append(item)
        else:
            counter.vals[slot] += 1
    {{else}}
    kh_resize_{{ttype}}(counter, n_values)

    for pos in range(n_values):
        item = {{to_c_type}}(values[pos])
        if dropna and is_nan_{{c_type}}(item):
            continue

        slot = kh_get_{{ttype}}(counter, item)
        if slot == counter.n_buckets:
            # first sighting of this key
            slot = kh_put_{{ttype}}(counter, item, &absent)
            counter.vals[slot] = 1
            seen_keys.append(item)
        else:
            counter.vals[slot] += 1
    {{endif}}

    # read the counts back out in first-seen order
    tallies = np.empty(counter.size, dtype=np.int64)
    for pos in range(counter.size):
        {{if dtype == 'object'}}
        slot = kh_get_{{ttype}}(counter, seen_keys.data[pos])
        {{else}}
        slot = kh_get_{{ttype}}(counter, seen_keys.data.data[pos])
        {{endif}}
        tallies[pos] = counter.vals[slot]

    kh_destroy_{{ttype}}(counter)

    return seen_keys.to_array(), tallies.base
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
{{endif}}
    """
    Flag the entries of ``values`` that repeat another entry.

    Parameters
    ----------
    values : {{dtype}} array
    keep : {'first', 'last', False}, default 'first'
        - 'first' : every occurrence after the first one is flagged.
        - 'last' : every occurrence before the last one is flagged
          (the array is walked back to front).
        - False : every occurrence of a repeated value is flagged.

    Returns
    -------
    ndarray[bool], same length as ``values``

    Raises
    ------
    ValueError
        If ``keep`` is none of the accepted options.
    """
    cdef:
        int ret = 0
        {{if dtype != 'object'}}
        {{c_type}} value
        {{endif}}
        Py_ssize_t i, n = len(values)
        khiter_t k
        kh_{{ttype}}_t *table
        ndarray[uint8_t, ndim=1, cast=True] out

    # Validate before allocating anything: kh_destroy is only reached on the
    # success path, so raising after kh_init would leak the table.
    if keep not in ('last', 'first', False):
        raise ValueError('keep must be either "first", "last" or False')

    out = np.empty(n, dtype='bool')
    table = kh_init_{{ttype}}()
    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    if keep == 'last':
        {{if dtype == 'object'}}
        for i in range(n - 1, -1, -1):
            # equivalent: range(n)[::-1], which cython doesn't like in nogil
            kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
            # ret == 0 is treated as "key was already in the table"
            out[i] = ret == 0
        {{else}}
        with nogil:
            for i in range(n - 1, -1, -1):
                # equivalent: range(n)[::-1], which cython doesn't like in nogil
                value = {{to_c_type}}(values[i])
                kh_put_{{ttype}}(table, value, &ret)
                # ret == 0 is treated as "key was already in the table"
                out[i] = ret == 0
        {{endif}}
    elif keep == 'first':
        {{if dtype == 'object'}}
        for i in range(n):
            kh_put_{{ttype}}(table, <PyObject*>values[i], &ret)
            out[i] = ret == 0
        {{else}}
        with nogil:
            for i in range(n):
                value = {{to_c_type}}(values[i])
                kh_put_{{ttype}}(table, value, &ret)
                out[i] = ret == 0
        {{endif}}
    else:
        # keep=False: the table remembers the position of the first
        # occurrence so it can be flagged retroactively once a repeat shows up
        {{if dtype == 'object'}}
        for i in range(n):
            value = values[i]
            k = kh_get_{{ttype}}(table, <PyObject*>value)
            if k != table.n_buckets:
                out[table.vals[k]] = 1
                out[i] = 1
            else:
                k = kh_put_{{ttype}}(table, <PyObject*>value, &ret)
                table.vals[k] = i
                out[i] = 0
        {{else}}
        with nogil:
            for i in range(n):
                value = {{to_c_type}}(values[i])
                k = kh_get_{{ttype}}(table, value)
                if k != table.n_buckets:
                    out[table.vals[k]] = 1
                    out[i] = 1
                else:
                    k = kh_put_{{ttype}}(table, value, &ret)
                    table.vals[k] = i
                    out[i] = 0
        {{endif}}

    kh_destroy_{{ttype}}(table)
    return out
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Membership
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
{{else}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
    """
    Test, element by element, whether each entry of ``arr`` occurs in
    ``values``.

    Parameters
    ----------
    arr : {{dtype}} ndarray
    values : {{dtype}} ndarray

    Returns
    -------
    boolean ndarray with the same length as ``arr``
    """
    cdef:
        Py_ssize_t pos
        Py_ssize_t n_values = len(values)
        Py_ssize_t n_arr = len(arr)
        khiter_t slot
        int absent = 0
        ndarray[uint8_t] hits
        {{c_type}} item
        kh_{{ttype}}_t *lookup = kh_init_{{ttype}}()

    # step 1: load every entry of ``values`` into the hash set
    kh_resize_{{ttype}}(lookup, n_values)

    {{if dtype == 'object'}}
    for pos in range(n_values):
        kh_put_{{ttype}}(lookup, <PyObject*>values[pos], &absent)
    {{else}}
    with nogil:
        for pos in range(n_values):
            item = {{to_c_type}}(values[pos])
            kh_put_{{ttype}}(lookup, item, &absent)
    {{endif}}

    # step 2: probe the set with every entry of ``arr``
    hits = np.empty(n_arr, dtype=np.uint8)

    {{if dtype == 'object'}}
    for pos in range(n_arr):
        item = arr[pos]
        slot = kh_get_{{ttype}}(lookup, <PyObject*>item)
        hits[pos] = (slot != lookup.n_buckets)
    {{else}}
    with nogil:
        for pos in range(n_arr):
            item = {{to_c_type}}(arr[pos])
            slot = kh_get_{{ttype}}(lookup, item)
            hits[pos] = (slot != lookup.n_buckets)
    {{endif}}

    kh_destroy_{{ttype}}(lookup)
    return hits.view(np.bool_)
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Mode Computations
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
{{else}}
cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
{{endif}}
    """
    Return the most frequent entries of ``values``.

    Frequencies come from value_count_{{dtype}}; every key whose count ties
    for the maximum is returned, in the order value_count_{{dtype}} reports
    the keys.

    Parameters
    ----------
    values : {{dtype}} array
    dropna : bool
        Forwarded to value_count_{{dtype}}.

    Returns
    -------
    ndarray of the modal values
    """
    cdef:
        {{if dtype == 'object'}}
        ndarray[{{dtype}}] keys
        ndarray[{{dtype}}] modes
        {{else}}
        {{dtype}}_t[:] keys
        ndarray[{{dtype}}_t] modes
        {{endif}}
        int64_t[:] counts
        int64_t freq, best = -1
        Py_ssize_t pos, n_keys, last = 0

    keys, counts = value_count_{{dtype}}(values, dropna)
    n_keys = len(keys)

    # sized for the worst case in which every key is a mode
    {{if dtype == 'object'}}
    modes = np.empty(n_keys, dtype=np.object_)
    {{else}}
    modes = np.empty(n_keys, dtype=np.{{dtype}})
    {{endif}}

    {{if dtype != 'object'}}
    with nogil:
        for pos in range(n_keys):
            freq = counts[pos]
            if freq < best:
                continue
            elif freq > best:
                # new maximum: restart the output at slot 0
                best = freq
                last = 0
            else:
                # tie with the current maximum: use the next slot
                last += 1

            modes[last] = keys[pos]
    {{else}}
    for pos in range(n_keys):
        freq = counts[pos]
        if freq < best:
            continue
        elif freq > best:
            # new maximum: restart the output at slot 0
            best = freq
            last = 0
        else:
            # tie with the current maximum: use the next slot
            last += 1

        modes[last] = keys[pos]
    {{endif}}

    return modes[:last + 1]
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
# Element types accepted by the value_count / duplicated / ismember / mode
# dispatchers; each member has matching *_<dtype> helpers generated from the
# template above.
ctypedef fused htfunc_t:
    complex128_t
    complex64_t
    float64_t
    float32_t
    uint64_t
    uint32_t
    uint16_t
    uint8_t
    int64_t
    int32_t
    int16_t
    int8_t
    object
|
||||
|
||||
|
||||
cpdef value_count(ndarray[htfunc_t] values, bint dropna):
    """
    Forward to the value_count_<dtype> helper that matches the element type
    of ``values`` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper exists for the element type.
    """
    if htfunc_t is float64_t:
        return value_count_float64(values, dropna)
    elif htfunc_t is float32_t:
        return value_count_float32(values, dropna)

    elif htfunc_t is complex128_t:
        return value_count_complex128(values, dropna)
    elif htfunc_t is complex64_t:
        return value_count_complex64(values, dropna)

    elif htfunc_t is int64_t:
        return value_count_int64(values, dropna)
    elif htfunc_t is int32_t:
        return value_count_int32(values, dropna)
    elif htfunc_t is int16_t:
        return value_count_int16(values, dropna)
    elif htfunc_t is int8_t:
        return value_count_int8(values, dropna)

    elif htfunc_t is uint64_t:
        return value_count_uint64(values, dropna)
    elif htfunc_t is uint32_t:
        return value_count_uint32(values, dropna)
    elif htfunc_t is uint16_t:
        return value_count_uint16(values, dropna)
    elif htfunc_t is uint8_t:
        return value_count_uint8(values, dropna)

    elif htfunc_t is object:
        return value_count_object(values, dropna)

    else:
        raise TypeError(values.dtype)
|
||||
|
||||
|
||||
cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
    """
    Forward to the duplicated_<dtype> helper that matches the element type
    of ``values`` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper exists for the element type.
    """
    if htfunc_t is float64_t:
        return duplicated_float64(values, keep)
    elif htfunc_t is float32_t:
        return duplicated_float32(values, keep)

    elif htfunc_t is complex128_t:
        return duplicated_complex128(values, keep)
    elif htfunc_t is complex64_t:
        return duplicated_complex64(values, keep)

    elif htfunc_t is int64_t:
        return duplicated_int64(values, keep)
    elif htfunc_t is int32_t:
        return duplicated_int32(values, keep)
    elif htfunc_t is int16_t:
        return duplicated_int16(values, keep)
    elif htfunc_t is int8_t:
        return duplicated_int8(values, keep)

    elif htfunc_t is uint64_t:
        return duplicated_uint64(values, keep)
    elif htfunc_t is uint32_t:
        return duplicated_uint32(values, keep)
    elif htfunc_t is uint16_t:
        return duplicated_uint16(values, keep)
    elif htfunc_t is uint8_t:
        return duplicated_uint8(values, keep)

    elif htfunc_t is object:
        return duplicated_object(values, keep)

    else:
        raise TypeError(values.dtype)
|
||||
|
||||
|
||||
cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
    """
    Forward to the ismember_<dtype> helper that matches the shared element
    type of ``arr`` and ``values`` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper exists for the element type.
    """
    if htfunc_t is float64_t:
        return ismember_float64(arr, values)
    elif htfunc_t is float32_t:
        return ismember_float32(arr, values)

    elif htfunc_t is complex128_t:
        return ismember_complex128(arr, values)
    elif htfunc_t is complex64_t:
        return ismember_complex64(arr, values)

    elif htfunc_t is int64_t:
        return ismember_int64(arr, values)
    elif htfunc_t is int32_t:
        return ismember_int32(arr, values)
    elif htfunc_t is int16_t:
        return ismember_int16(arr, values)
    elif htfunc_t is int8_t:
        return ismember_int8(arr, values)

    elif htfunc_t is uint64_t:
        return ismember_uint64(arr, values)
    elif htfunc_t is uint32_t:
        return ismember_uint32(arr, values)
    elif htfunc_t is uint16_t:
        return ismember_uint16(arr, values)
    elif htfunc_t is uint8_t:
        return ismember_uint8(arr, values)

    elif htfunc_t is object:
        return ismember_object(arr, values)

    else:
        raise TypeError(values.dtype)
|
||||
|
||||
|
||||
cpdef mode(ndarray[htfunc_t] values, bint dropna):
    """
    Forward to the mode_<dtype> helper that matches the element type of
    ``values`` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper exists for the element type.
    """
    if htfunc_t is float64_t:
        return mode_float64(values, dropna)
    elif htfunc_t is float32_t:
        return mode_float32(values, dropna)

    elif htfunc_t is complex128_t:
        return mode_complex128(values, dropna)
    elif htfunc_t is complex64_t:
        return mode_complex64(values, dropna)

    elif htfunc_t is int64_t:
        return mode_int64(values, dropna)
    elif htfunc_t is int32_t:
        return mode_int32(values, dropna)
    elif htfunc_t is int16_t:
        return mode_int16(values, dropna)
    elif htfunc_t is int8_t:
        return mode_int8(values, dropna)

    elif htfunc_t is uint64_t:
        return mode_uint64(values, dropna)
    elif htfunc_t is uint32_t:
        return mode_uint32(values, dropna)
    elif htfunc_t is uint16_t:
        return mode_uint16(values, dropna)
    elif htfunc_t is uint8_t:
        return mode_uint8(values, dropna)

    elif htfunc_t is object:
        return mode_object(values, dropna)

    else:
        raise TypeError(values.dtype)
|
||||
|
||||
|
||||
{{py:

# One row per generated specialisation: (name, dtype, ttype, c_type)
dtypes = [
    ('Int64', 'int64', 'int64', 'int64_t'),
    ('Int32', 'int32', 'int32', 'int32_t'),
]

}}
|
||||
|
||||
{{for name, dtype, ttype, c_type in dtypes}}
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
    """
    Positions at which each distinct label first occurs, *excluding* the
    label -1. Equivalent to:
        np.unique(labels, return_index=True)[1]
    """
    cdef:
        int absent = 0
        Py_ssize_t pos, n_labels = len(labels)
        kh_{{ttype}}_t *seen = kh_init_{{ttype}}()
        {{name}}Vector first_pos = {{name}}Vector()
        ndarray[{{c_type}}, ndim=1] arr
        {{name}}VectorData *buf = first_pos.data

    kh_resize_{{ttype}}(seen, min(kh_needed_n_buckets(n_labels), SIZE_HINT_LIMIT))

    with nogil:
        for pos in range(n_labels):
            kh_put_{{ttype}}(seen, labels[pos], &absent)
            if absent == 0:
                # label was already recorded at an earlier position
                continue

            if needs_resize(buf):
                # growing the vector goes through a Python-level method
                with gil:
                    first_pos.resize()
            append_data_{{ttype}}(buf, pos)

    kh_destroy_{{ttype}}(seen)

    arr = first_pos.to_array()
    # reorder the positions so that the labels they point at are ascending
    order = np.asarray(labels)[arr].argsort()
    arr = arr[order]

    # after sorting, a -1 label (if any) sits in front; leave it out
    if arr.size != 0 and labels[arr[0]] == -1:
        return arr[1:]
    return arr
|
||||
|
||||
{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,65 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
from pandas import MultiIndex
|
||||
|
||||
class IndexEngine:
|
||||
over_size_threshold: bool
|
||||
def __init__(self, values: np.ndarray): ...
|
||||
def __contains__(self, val: object) -> bool: ...
|
||||
# -> int | slice | np.ndarray[bool]
|
||||
def get_loc(self, val: object) -> int | slice | np.ndarray: ...
|
||||
def sizeof(self, deep: bool = ...) -> int: ...
|
||||
def __sizeof__(self) -> int: ...
|
||||
@property
|
||||
def is_unique(self) -> bool: ...
|
||||
@property
|
||||
def is_monotonic_increasing(self) -> bool: ...
|
||||
@property
|
||||
def is_monotonic_decreasing(self) -> bool: ...
|
||||
@property
|
||||
def is_mapping_populated(self) -> bool: ...
|
||||
def clear_mapping(self): ...
|
||||
def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
|
||||
def get_indexer_non_unique(
|
||||
self,
|
||||
targets: np.ndarray,
|
||||
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
|
||||
|
||||
# Engine subclasses. None of them declares anything beyond what it inherits,
# so each body is just an ellipsis.
class Float64Engine(IndexEngine):
    ...

class Float32Engine(IndexEngine):
    ...

class Int64Engine(IndexEngine):
    ...

class Int32Engine(IndexEngine):
    ...

class Int16Engine(IndexEngine):
    ...

class Int8Engine(IndexEngine):
    ...

class UInt64Engine(IndexEngine):
    ...

class UInt32Engine(IndexEngine):
    ...

class UInt16Engine(IndexEngine):
    ...

class UInt8Engine(IndexEngine):
    ...

class ObjectEngine(IndexEngine):
    ...

class DatetimeEngine(Int64Engine):
    ...

class TimedeltaEngine(DatetimeEngine):
    ...

class PeriodEngine(Int64Engine):
    ...
|
||||
|
||||
class BaseMultiIndexCodesEngine:
|
||||
levels: list[np.ndarray]
|
||||
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
|
||||
def __init__(
|
||||
self,
|
||||
levels: list[np.ndarray], # all entries hashable
|
||||
labels: list[np.ndarray], # all entries integer-dtyped
|
||||
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
|
||||
): ...
|
||||
def get_indexer(
|
||||
self,
|
||||
target: npt.NDArray[np.object_],
|
||||
) -> npt.NDArray[np.intp]: ...
|
||||
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
|
||||
def get_indexer_with_fill(
|
||||
self,
|
||||
target: np.ndarray, # np.ndarray[object] of tuples
|
||||
values: np.ndarray, # np.ndarray[object] of tuples
|
||||
method: str,
|
||||
limit: int | None,
|
||||
) -> npt.NDArray[np.intp]: ...
|
||||
@@ -0,0 +1,799 @@
|
||||
cimport cython
|
||||
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
float32_t,
|
||||
float64_t,
|
||||
int8_t,
|
||||
int16_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
intp_t,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
uint16_t,
|
||||
uint32_t,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
from pandas._libs cimport util
|
||||
from pandas._libs.hashtable cimport HashTable
|
||||
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
|
||||
from pandas._libs.tslibs.period cimport is_period_object
|
||||
from pandas._libs.tslibs.timedeltas cimport _Timedelta
|
||||
from pandas._libs.tslibs.timestamps cimport _Timestamp
|
||||
|
||||
from pandas._libs import (
|
||||
algos,
|
||||
hashtable as _hash,
|
||||
)
|
||||
|
||||
from pandas._libs.lib cimport eq_NA_compat
|
||||
from pandas._libs.missing cimport (
|
||||
C_NA as NA,
|
||||
checknull,
|
||||
is_matching_na,
|
||||
)
|
||||
|
||||
|
||||
cdef inline bint is_definitely_invalid_key(object val):
    """
    Return True when ``val`` cannot be hashed, i.e. ``hash(val)`` raises
    TypeError; any other outcome of hashing yields False.
    """
    cdef:
        bint unhashable = False

    try:
        hash(val)
    except TypeError:
        unhashable = True
    return unhashable
|
||||
|
||||
|
||||
cdef ndarray _get_bool_indexer(ndarray values, object val):
    """
    Return a ndarray[bool] marking the positions where ``val`` matches
    ``values``.

    If val is not NA, this is equivalent to `values == val`
    """
    # The caller must already have run _check_type on ``val``.
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] mask
        Py_ssize_t pos, n
        object element

    if values.descr.type_num != cnp.NPY_OBJECT:
        # non-object dtype: plain vectorised comparison, with NaN special-cased
        if util.is_nan(val):
            mask = np.isnan(values)
        else:
            mask = values == val

    elif checknull(val):
        # object dtype and an NA key: only the _matching_ kind of NA counts
        n = len(values)
        mask = np.empty(n, dtype=np.uint8)

        for pos in range(n):
            element = values[pos]
            mask[pos] = is_matching_na(element, val)

    else:
        # object dtype, ordinary key
        mask = eq_NA_compat(values, val)

    return mask.view(bool)
|
||||
|
||||
|
||||
# Don't populate hash tables in monotonic indexes larger than this
|
||||
_SIZE_CUTOFF = 1_000_000
|
||||
|
||||
|
||||
cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
    """
    Collapse a boolean mask into the most specific result available.

    Returns the integer position when exactly one entry is set and the mask
    itself when several are set; raises ``KeyError(val)`` when none is.
    """
    cdef:
        ndarray[intp_t, ndim=1] hits
        int n_hits

    hits = np.where(indexer)[0]
    n_hits = len(hits)

    if n_hits == 1:
        return int(hits[0])
    elif n_hits > 1:
        return indexer

    raise KeyError(val)
|
||||
|
||||
|
||||
@cython.freelist(32)
cdef class IndexEngine:
    """
    Lookup engine over an array of index values.

    The hash table over ``values`` is built lazily, and the uniqueness /
    monotonicity answers are computed on first request and cached in flags
    that ``clear_mapping`` resets.
    """

    cdef readonly:
        ndarray values
        HashTable mapping
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check
        object _np_type

    def __init__(self, ndarray values):
        self.values = values

        # at or above the cutoff, get_loc bisects monotonic values
        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.clear_mapping()
        self._np_type = values.dtype.type

    def __contains__(self, val: object) -> bool:
        # Callers are expected to hand in a hashable ``val``.
        self._ensure_mapping_populated()
        return val in self.mapping

    cpdef get_loc(self, object val):
        """
        Locate ``val`` within ``self.values``.

        Returns
        -------
        Py_ssize_t | slice | ndarray[bool]

        Raises
        ------
        TypeError
            If ``val`` cannot be hashed.
        KeyError
            If ``val`` is not present.
        """
        cdef:
            Py_ssize_t pos

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            # large sorted index: locate by binary search
            if not self.is_unique:
                return self._get_loc_duplicates(val)

            arr = self.values
            pos = self._searchsorted_left(val)
            if pos >= len(arr) or arr[pos] != val:
                raise KeyError(val)
            return pos

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except OverflowError as err:
            # GH#41775 OverflowError e.g. if we are uint64 and val is -1
            #  or if we are int64 and value is np.iinfo(np.int64).max+1
            #  (the uint64 with -1 case should actually be excluded by _check_type)
            raise KeyError(val) from err

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        Left-side ``searchsorted`` position of ``val`` in ``self.values``,
        after converting ``val`` with the values' numpy scalar type.
        """
        # The caller must already have run _check_type on ``val``.
        key = self._np_type(val)
        return self.values.searchsorted(key, side="left")

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t left, right

        if not self.is_monotonic_increasing:
            # unsorted values: fall back to an element-wise mask
            return self._maybe_get_bool_indexer(val)

        arr = self.values
        try:
            left = arr.searchsorted(val, side='left')
            right = arr.searchsorted(val, side='right')
        except TypeError:
            # e.g. GH#29189 get_loc(None) with a Float64Index
            #  2021-09-29 Now only reached for object-dtype
            raise KeyError(val)

        if right == left:
            raise KeyError(val)
        if right - left == 1:
            return left
        return slice(left, right)

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] mask

        mask = _get_bool_indexer(self.values, val)
        return _unpack_bool_indexer(mask, val)

    def sizeof(self, deep: bool = False) -> int:
        """
        Size reported by the hash table, or 0 while no table has been built.
        """
        if self.is_mapping_populated:
            return self.mapping.sizeof(deep=deep)
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    @property
    def is_unique(self) -> bool:
        # computed on first access, then served from the cached flag
        if self.need_unique_check:
            self._do_unique_check()

        return self.unique == 1

    cdef inline _do_unique_check(self):
        # populating the mapping settles uniqueness as a by-product
        self._ensure_mapping_populated()

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef inline _do_monotonic_check(self):
        cdef:
            bint is_unique

        try:
            (
                self.monotonic_inc,
                self.monotonic_dec,
                is_unique,
            ) = self._call_monotonic(self.values)
        except TypeError:
            # treated as neither increasing nor decreasing, uniqueness unknown
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0

        self.need_monotonic_check = 0

        # only a positive answer is conclusive about uniqueness
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    cdef _make_hash_table(self, Py_ssize_t n):
        raise NotImplementedError  # pragma: no cover

    cdef _check_type(self, object val):
        hash(val)

    @property
    def is_mapping_populated(self) -> bool:
        return self.mapping is not None

    cdef inline _ensure_mapping_populated(self):
        # Build the hash table on first use. Doing so also answers the
        # uniqueness question, so the pending unique check is cleared.
        if self.is_mapping_populated:
            self.need_unique_check = 0
            return

        arr = self.values
        self.mapping = self._make_hash_table(len(arr))
        self.mapping.map_locations(arr)

        if len(self.mapping) == len(arr):
            self.unique = 1

        self.need_unique_check = 0

    def clear_mapping(self):
        # drop the table and forget every cached answer
        self.mapping = None

        self.unique = 0
        self.monotonic_inc = 0
        self.monotonic_dec = 0

        self.need_monotonic_check = 1
        self.need_unique_check = 1

    def get_indexer(self, ndarray values) -> np.ndarray:
        self._ensure_mapping_populated()
        return self.mapping.lookup(values)

    def get_indexer_non_unique(self, ndarray targets):
        """
        Build an indexer for ``targets`` against a possibly non-unique index.

        Each target contributes, in target order, one entry per matching
        position in ``self.values``, or a single -1 when it has no match.

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
            Positions within ``targets`` of the entries that were not found.
        """
        cdef:
            ndarray values
            ndarray[intp_t] result, missing
            set pending, unsearchable, found_nas
            dict locs = {}
            object val
            Py_ssize_t n_out = 0, n_missing = 0
            Py_ssize_t i, j, n, n_t, capacity, start, end
            bint check_na_values = False

        values = self.values
        pending = set(targets)

        n = len(values)
        n_t = len(targets)
        # initial output capacity; grown in steps of 10_000 when exhausted
        capacity = min(n, 10_000)

        result = np.empty(capacity, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # Phase 1: collect, per distinct target, its positions in the index
        if (
            pending and
            len(pending) < 5 and
            not any([checknull(t) for t in pending]) and
            self.is_monotonic_increasing
        ):
            # few distinct targets on a monotonically increasing index:
            # bisect for each of them
            unsearchable = set()
            for starget in pending:
                try:
                    start = values.searchsorted(starget, side='left')
                    end = values.searchsorted(starget, side='right')
                except TypeError:  # e.g. if we tried to search for string in int array
                    unsearchable.add(starget)
                else:
                    if start != end:
                        locs[starget] = list(range(start, end))

            pending = unsearchable

        if pending:
            # whatever is left is resolved by one pass over the whole index

            # NA bookkeeping is only switched on for object dtype
            if values.dtype == object:
                check_na_values = True
                # NA values encountered in ``values`` so far
                found_nas = set()

            for i in range(n):
                val = values[i]

                # GH#43870
                # NA lookups (ie. np.nan, float("NaN"), Decimal("NaN"),
                # dt64nat, td64nat) are folded onto the first matching NA
                # seen in ``values``
                if check_na_values and checknull(val):
                    match = [na for na in found_nas if is_matching_na(val, na)]

                    if match:
                        # this kind of NA was seen before: reuse that object
                        assert len(match) == 1
                        val = match[0]
                    else:
                        found_nas.add(val)

                        # let `in` succeed below if any target is the same
                        # kind of NA
                        match_stargets = [
                            x for x in pending if is_matching_na(val, x)
                        ]

                        if match_stargets:
                            # add our 'standardized' na
                            pending.add(val)

                if val in pending:
                    locs.setdefault(val, []).append(i)

        # Phase 2: emit the collected positions in target order
        for i in range(n_t):
            val = targets[i]

            # translate an NA target to the standardized NA, if one was seen
            if check_na_values and checknull(val):
                match = [na for na in found_nas if is_matching_na(val, na)]
                if match:
                    assert len(match) == 1
                    val = match[0]

            if val in locs:
                # found: one output entry per matching position
                for j in locs[val]:

                    # realloc if needed
                    if n_out >= capacity:
                        capacity += 10_000
                        result = np.resize(result, capacity)

                    result[n_out] = j
                    n_out += 1

            else:
                # not found: a single -1, and note the target's position
                if n_out >= capacity:
                    capacity += 10_000
                    result = np.resize(result, capacity)
                result[n_out] = -1
                n_out += 1
                missing[n_missing] = i
                n_missing += 1

        return result[0:n_out], missing[0:n_missing]
|
||||
|
||||
|
||||
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    """
    Left-most insertion position of ``val`` in the sorted array ``values``.

    GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    (treats a tuple `val` as a sequence of keys instead of a single key),
    so something similar is implemented here. The result is meant to agree
    with the stdlib's bisect.bisect_left, including for empty and
    single-element arrays.

    Parameters
    ----------
    values : ndarray, assumed sorted ascending
    val : object

    Returns
    -------
    Py_ssize_t in the range [0, len(values)]

    Raises
    ------
    Whatever comparing ``val`` with an element raises (e.g. TypeError).
    """
    cdef:
        Py_ssize_t mid = 0, lo = 0, n = len(values), hi
        object pval

    if n == 0:
        # nothing to compare against: the only insertion point is 0
        return 0

    hi = n - 1
    if val > values[hi]:
        # beyond the last element
        return n

    while lo < hi:
        mid = (lo + hi) // 2
        pval = values[mid]
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            # exact hit: walk left to the first of any run of equal elements
            while mid > 0 and val == values[mid - 1]:
                mid -= 1
            return mid

    # lo == hi here; ``mid`` is the last probe and is either the answer or
    # one short of it. For a single-element array the loop never runs and
    # mid is still 0.
    if val <= values[mid]:
        return mid
    else:
        return mid + 1
|
||||
|
||||
|
||||
cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """
    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        # values.searchsorted would interpret a tuple `val` as a sequence of
        # keys rather than one key, hence the dedicated bisection routine.
        # A TypeError from it is reported as a missing key.
        try:
            return _bin_search(self.values, val)
        except TypeError as err:
            raise KeyError(val) from err
|
||||
|
||||
|
||||
cdef class DatetimeEngine(Int64Engine):
    """
    Engine whose keys are given as Timestamp/NaT scalars and looked up by
    their ``.value``.
    """

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NB: tz-awareness compatibility is the caller's responsibility and
        #  must have been settled before we get here
        if isinstance(scalar, _Timestamp) or scalar is NaT:
            return scalar.value
        raise TypeError(scalar)

    def __contains__(self, val: object) -> bool:
        # Callers are expected to hand in a hashable ``val``.
        # A scalar of the wrong type raises TypeError here instead of
        # returning False.
        self._unbox_scalar(val)
        try:
            self.get_loc(val)
        except KeyError:
            return False
        else:
            return True

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)

        cdef:
            Py_ssize_t pos

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        try:
            unboxed = self._unbox_scalar(val)
        except TypeError:
            # a scalar of the wrong type is reported as a missing key
            raise KeyError(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            # large sorted index: locate by binary search
            if not self.is_unique:
                return self._get_loc_duplicates(unboxed)

            arr = self.values
            pos = arr.searchsorted(unboxed, side='left')

            if pos == len(arr) or arr[pos] != unboxed:
                raise KeyError(val)
            return pos

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(unboxed)

        try:
            return self.mapping.get_item(unboxed)
        except KeyError:
            # re-raise with the caller's scalar rather than the unboxed integer
            raise KeyError(val)
|
||||
|
||||
|
||||
cdef class TimedeltaEngine(DatetimeEngine):

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # Same contract as the parent's unboxing, but the accepted scalar
        #  type is _Timedelta (or NaT)
        if scalar is not NaT and not isinstance(scalar, _Timedelta):
            raise TypeError(scalar)
        return scalar.value
|
||||
|
||||
|
||||
cdef class PeriodEngine(Int64Engine):

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NaT unboxes to its `.value`, a Period to its `.ordinal`;
        #  anything else is rejected.
        if scalar is NaT:
            return scalar.value
        if not is_period_object(scalar):
            raise TypeError(scalar)
        # NB: we assume that we have the correct freq here.
        return scalar.ordinal

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Period or NaT
        cdef:
            int64_t conv

        # A key that cannot be unboxed cannot be in the index
        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        # Lookup proper is delegated to the integer engine
        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)
|
||||
|
||||
|
||||
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                                copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        # Combines per-level codes into one integer per label; concrete
        # engines supply the implementation.
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        # +1 shifts the "not found" result of get_indexer_for (-1) to 0,
        # matching the shift applied to the labels in __init__
        level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
                       in zip(self.levels, zt)]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target: ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length.  Should be the set of
            the MultiIndex's values.
        method: string
            "backfill" or "pad"
        limit: int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        assert method in ("backfill", "pad")
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        target_order = np.argsort(target).astype('int64')
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)

        # Allocate directly as int64: going through the default float64 dtype
        # and casting would allocate twice and convert uninitialised floats
        # to integers.  Every slot is overwritten by the loops below.
        new_codes = np.empty(num_values, dtype='int64')
        new_target_codes = np.empty(num_target_values, dtype='int64')

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        """
        Locate the tuple `key`.

        Each component is located in its level (null components map to
        code 0), the resulting codes are combined into a single integer and
        that integer is looked up in the underlying engine.

        Raises
        ------
        TypeError
            If `key` is a definitely-invalid key.
        KeyError
            If `key` is not a tuple or one of its components is missing from
            the corresponding level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # report the full tuple rather than the failing component
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        # Straight delegation to the underlying integer engine
        indexer = self._base.get_indexer_non_unique(self, target)

        return indexer

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
|
||||
|
||||
|
||||
# Generated from template.
|
||||
include "index_class_helper.pxi"
|
||||
@@ -0,0 +1,51 @@
|
||||
"""
|
||||
Template for functions of IndexEngine subclasses.
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# IndexEngine Subclass Methods
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
{{py:

# One (name, dtype) pair per generated engine class; `name` is spliced
# into the class and hash-table names in the loop that follows.
dtypes = [
    ('Float64', 'float64'),
    ('Float32', 'float32'),
    ('Int64', 'int64'),
    ('Int32', 'int32'),
    ('Int16', 'int16'),
    ('Int8', 'int8'),
    ('UInt64', 'uint64'),
    ('UInt32', 'uint32'),
    ('UInt16', 'uint16'),
    ('UInt8', 'uint8'),
]
}}
|
||||
|
||||
{{for name, dtype in dtypes}}


cdef class {{name}}Engine(IndexEngine):

    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.{{name}}HashTable(n)

    cdef _check_type(self, object val):
        {{if name in {'Float64', 'Float32'} }}
        if not (util.is_integer_object(val) or util.is_float_object(val)):
            # in particular catch bool and avoid casting True -> 1.0
            raise KeyError(val)
        {{else}}
        if not util.is_integer_object(val):
            raise KeyError(val)
        {{if name.startswith("U")}}
        if val < 0:
            # cannot have negative values with unsigned int dtype
            raise KeyError(val)
        {{endif}}
        {{endif}}


{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,25 @@
|
||||
cdef class NDFrameIndexerBase:
    """
    A base class for _NDFrameIndexer for fast instantiation and attribute access.
    """
    cdef public:
        str name
        object obj, _ndim

    def __init__(self, name: str, obj):
        self.obj = obj
        self.name = name
        # filled in on first access of `ndim`
        self._ndim = None

    @property
    def ndim(self) -> int:
        # Reading `ndim` from `obj` isn't entirely cheap, so it is looked up
        # lazily and cached on the instance.
        cached = self._ndim
        if cached is not None:
            return cached

        cached = self.obj.ndim
        # The value is stored before it is validated, so only this first
        # access can raise.
        self._ndim = cached
        if cached > 2:
            raise ValueError(  # pragma: no cover
                "NDFrameIndexer does not support NDFrame objects with ndim > 2"
            )
        return cached
|
||||
Binary file not shown.
@@ -0,0 +1,85 @@
|
||||
from typing import (
|
||||
Iterator,
|
||||
Sequence,
|
||||
final,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
T,
|
||||
npt,
|
||||
)
|
||||
|
||||
from pandas import Index
|
||||
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
|
||||
from pandas.core.internals.blocks import Block as B
|
||||
|
||||
# Length of a slice; `objlen` is optional (default supplied by the implementation).
def slice_len(
    slc: slice,
    objlen: int = ...,
) -> int: ...
|
||||
# Returns a list of (blkno, slice-or-array) pairs.
def get_blkno_indexers(
    blknos: np.ndarray,  # int64_t[:]
    group: bool = ...,
) -> list[tuple[int, slice | np.ndarray]]: ...
|
||||
# Iterates over (blkno, BlockPlacement) pairs.
def get_blkno_placements(
    blknos: np.ndarray,
    group: bool = ...,
) -> Iterator[tuple[int, BlockPlacement]]: ...
|
||||
# Returns the pair (new_blklocs, new_blknos).
def update_blklocs_and_blknos(
    blklocs: npt.NDArray[np.intp],
    blknos: npt.NDArray[np.intp],
    loc: int,
    nblocks: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
|
||||
|
||||
class BlockPlacement:
    # Accepts a single integer position, a slice, or an array of positions.
    def __init__(self, val: int | slice | np.ndarray): ...
    @property
    def indexer(self) -> np.ndarray | slice: ...
    @property
    def as_array(self) -> np.ndarray: ...
    @property
    def as_slice(self) -> slice: ...
    @property
    def is_slice_like(self) -> bool: ...
    # Slice / sequence indexing yields a new placement; an integer yields
    # the single position.
    @overload
    def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ...
    @overload
    def __getitem__(self, loc: int) -> int: ...
    def __iter__(self) -> Iterator[int]: ...
    def __len__(self) -> int: ...
    def delete(self, loc) -> BlockPlacement: ...
    def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
    # `add` and `increment_above` are Python-visible on the compiled class
    # and were previously missing from this stub.
    def add(self, other) -> BlockPlacement: ...
    def increment_above(self, loc: int) -> BlockPlacement: ...
    def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ...
|
||||
|
||||
class SharedBlock:
    # Attributes exposed by the compiled base class.
    _mgr_locs: BlockPlacement
    ndim: int
    values: ArrayLike

    def __init__(
        self,
        values: ArrayLike,
        placement: BlockPlacement,
        ndim: int,
    ): ...
|
||||
|
||||
class NumpyBlock(SharedBlock):
    # `values` is narrowed to a plain ndarray for this block type.
    values: np.ndarray

    @final
    def getitem_block_index(self: T, slicer: slice) -> T: ...
|
||||
|
||||
class NDArrayBackedBlock(SharedBlock):
    # `values` is narrowed to an NDArrayBackedExtensionArray for this block type.
    values: NDArrayBackedExtensionArray

    @final
    def getitem_block_index(self: T, slicer: slice) -> T: ...
|
||||
|
||||
# Adds nothing to SharedBlock at the typing level.
class Block(SharedBlock):
    ...
|
||||
|
||||
class BlockManager:
    # Attributes exposed by the compiled class.
    blocks: tuple[B, ...]
    axes: list[Index]
    _known_consolidated: bool
    _is_consolidated: bool
    _blknos: np.ndarray
    _blklocs: np.ndarray

    def __init__(
        self,
        blocks: tuple[B, ...],
        axes: list[Index],
        verify_integrity=...,
    ): ...
    def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ...
    def _rebuild_blknos_and_blklocs(self) -> None: ...
|
||||
@@ -0,0 +1,824 @@
|
||||
from collections import defaultdict
|
||||
|
||||
import cython
|
||||
from cython import Py_ssize_t
|
||||
|
||||
from cpython.slice cimport PySlice_GetIndicesEx
|
||||
|
||||
|
||||
cdef extern from "Python.h":
|
||||
Py_ssize_t PY_SSIZE_T_MAX
|
||||
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
NPY_INTP,
|
||||
int64_t,
|
||||
intp_t,
|
||||
ndarray,
|
||||
)
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
from pandas._libs.algos import ensure_int64
|
||||
|
||||
from pandas._libs.arrays cimport NDArrayBacked
|
||||
from pandas._libs.util cimport (
|
||||
is_array,
|
||||
is_integer_object,
|
||||
)
|
||||
|
||||
|
||||
@cython.final
@cython.freelist(32)
cdef class BlockPlacement:
    """
    A set of integer positions held either as a slice or as an intp ndarray.

    Whichever representation was not supplied at construction is computed
    on demand and cached (see `_ensure_has_slice` and `as_array`).
    """
    # __slots__ = '_as_slice', '_as_array', '_len'
    cdef:
        slice _as_slice
        ndarray _as_array  # Note: this still allows `None`; will be intp_t
        bint _has_slice, _has_array, _is_known_slice_like

    def __cinit__(self, val):
        cdef:
            slice slc

        self._as_slice = None
        self._as_array = None
        self._has_slice = False
        self._has_array = False

        if is_integer_object(val):
            # a single position is stored as a length-1 slice
            slc = slice(val, val + 1, 1)
            self._as_slice = slc
            self._has_slice = True
        elif isinstance(val, slice):
            slc = slice_canonize(val)

            if slc.start != slc.stop:
                self._as_slice = slc
                self._has_slice = True
            else:
                # empty slice -> store an empty array instead
                arr = np.empty(0, dtype=np.intp)
                self._as_array = arr
                self._has_array = True
        else:
            # Cython memoryview interface requires ndarray to be writeable.
            if (
                not is_array(val)
                or not cnp.PyArray_ISWRITEABLE(val)
                or (<ndarray>val).descr.type_num != cnp.NPY_INTP
            ):
                arr = np.require(val, dtype=np.intp, requirements='W')
            else:
                arr = val
            # Caller is responsible for ensuring arr.ndim == 1
            self._as_array = arr
            self._has_array = True

    def __str__(self) -> str:
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            v = self._as_slice
        else:
            v = self._as_array

        return f"{type(self).__name__}({v})"

    def __repr__(self) -> str:
        return str(self)

    def __len__(self) -> int:
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return slice_len(s)
        else:
            return len(self._as_array)

    def __iter__(self):
        cdef:
            slice s = self._ensure_has_slice()
            Py_ssize_t start, stop, step, _

        if s is not None:
            start, stop, step, _ = slice_get_indices_ex(s)
            return iter(range(start, stop, step))
        else:
            return iter(self._as_array)

    @property
    def as_slice(self) -> slice:
        """
        The positions as a slice.

        Raises
        ------
        TypeError
            If the positions cannot be expressed as a slice.
        """
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return s
        else:
            raise TypeError("Not slice-like")

    @property
    def indexer(self):
        """
        The slice form when one exists, otherwise the array form.
        """
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            return s
        else:
            return self._as_array

    @property
    def as_array(self) -> np.ndarray:
        """
        The positions as an ndarray, materialized from the slice on first use.
        """
        cdef:
            Py_ssize_t start, stop, step, _

        if not self._has_array:
            start, stop, step, _ = slice_get_indices_ex(self._as_slice)
            # NOTE: this is the C-optimized equivalent of
            #  `np.arange(start, stop, step, dtype=np.intp)`
            self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
            self._has_array = True

        return self._as_array

    @property
    def is_slice_like(self) -> bool:
        cdef:
            slice s = self._ensure_has_slice()

        return s is not None

    def __getitem__(self, loc):
        cdef:
            slice s = self._ensure_has_slice()

        if s is not None:
            val = slice_getitem(s, loc)
        else:
            val = self._as_array[loc]

        # a scalar result is returned as-is rather than wrapped
        if not isinstance(val, slice) and val.ndim == 0:
            return val

        return BlockPlacement(val)

    def delete(self, loc) -> BlockPlacement:
        """
        Return a new placement with the entries at `loc` removed.
        """
        return BlockPlacement(np.delete(self.as_array, loc, axis=0))

    def append(self, others) -> BlockPlacement:
        """
        Return a placement holding these positions followed by those of `others`.
        """
        if not len(others):
            return self

        return BlockPlacement(
            np.concatenate([self.as_array] + [o.as_array for o in others])
        )

    cdef BlockPlacement iadd(self, other):
        # Shift every position by `other`; raises ValueError when the shift
        # would push positions below zero.
        cdef:
            slice s = self._ensure_has_slice()
            Py_ssize_t other_int, start, stop, step

        if is_integer_object(other) and s is not None:
            other_int = <Py_ssize_t>other

            if other_int == 0:
                # BlockPlacement is treated as immutable
                return self

            start, stop, step, _ = slice_get_indices_ex(s)
            start += other_int
            stop += other_int

            if (step > 0 and start < 0) or (step < 0 and stop < step):
                raise ValueError("iadd causes length change")

            if stop < 0:
                val = slice(start, None, step)
            else:
                val = slice(start, stop, step)

            return BlockPlacement(val)
        else:
            newarr = self.as_array + other
            if (newarr < 0).any():
                raise ValueError("iadd causes length change")

            val = newarr
            return BlockPlacement(val)

    def add(self, other) -> BlockPlacement:
        # We can get here with int or ndarray
        return self.iadd(other)

    cdef slice _ensure_has_slice(self):
        # Computes the slice form once.  `_has_slice` is set even when the
        # conversion yields None, so the attempt is not repeated.
        if not self._has_slice:
            self._as_slice = indexer_as_slice(self._as_array)
            self._has_slice = True

        return self._as_slice

    cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
        """
        Increment any entries of 'loc' or above by one.
        """
        cdef:
            slice nv, s = self._ensure_has_slice()
            Py_ssize_t start, stop, step
            ndarray[intp_t, ndim=1] newarr

        if s is not None:
            # see if we are either all-above or all-below, each of which
            #  have fastpaths available.

            start, stop, step, _ = slice_get_indices_ex(s)

            if start < loc and stop <= loc:
                # We are entirely below, nothing to increment
                return self

            if start >= loc and stop >= loc:
                # We are entirely above, we can efficiently increment out slice
                nv = slice(start + 1, stop + 1, step)
                return BlockPlacement(nv)

        if loc == 0:
            # fastpath where we know everything is >= 0
            newarr = self.as_array + 1
            return BlockPlacement(newarr)

        newarr = self.as_array.copy()
        newarr[newarr >= loc] += 1
        return BlockPlacement(newarr)

    def tile_for_unstack(self, factor: int) -> np.ndarray:
        """
        Find the new mgr_locs for the un-stacked version of a Block.
        """
        cdef:
            slice slc = self._ensure_has_slice()
            slice new_slc
            ndarray[intp_t, ndim=1] new_placement

        if slc is not None and slc.step == 1:
            new_slc = slice(slc.start * factor, slc.stop * factor, 1)
            # equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
            new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP)
        else:
            # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
            mapped = [
                # equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
                cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP)
                for x in self
            ]
            new_placement = np.concatenate(mapped)
        return new_placement
|
||||
|
||||
|
||||
cdef slice slice_canonize(slice s):
    """
    Convert slice to canonical bounded form.

    Raises ValueError for a zero step or for a slice whose extent would
    depend on the length of the container ("unbounded slice").
    """
    cdef:
        Py_ssize_t start = 0, stop = 0, step = 1

    if s.step is not None:
        step = <Py_ssize_t>s.step
        if step == 0:
            raise ValueError("slice step cannot be zero")

    if step > 0:
        # forward slice: the stop must be given explicitly
        if s.stop is None:
            raise ValueError("unbounded slice")
        stop = <Py_ssize_t>s.stop

        if s.start is None:
            start = 0
        else:
            start = <Py_ssize_t>s.start
            if start > stop:
                # empty range; clamp so that start == stop
                start = stop
    else:
        # step < 0 (zero was rejected above): the start must be given
        if s.start is None:
            raise ValueError("unbounded slice")
        start = <Py_ssize_t>s.start

        if s.stop is None:
            stop = -1
        else:
            stop = <Py_ssize_t>s.stop
            if stop > start:
                # empty range; clamp so that start == stop
                stop = start

    if start < 0 or (stop < 0 and s.stop is not None and step > 0):
        raise ValueError("unbounded slice")

    if stop < 0:
        return slice(start, None, step)
    return slice(start, stop, step)
|
||||
|
||||
|
||||
cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
    """
    Get length of a bounded slice.

    The slice must not have any "open" bounds that would create dependency on
    container size, i.e.:
    - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
    - if ``s.step < 0``, ``s.start`` is not ``None``

    Otherwise, the result is unreliable.
    """
    cdef:
        Py_ssize_t lo, hi, stride, count

    if slc is None:
        raise TypeError("slc must be slice")  # pragma: no cover

    # Only the computed length is of interest; the other outputs are discarded
    PySlice_GetIndicesEx(slc, objlen, &lo, &hi, &stride, &count)
    return count
|
||||
|
||||
|
||||
cdef (Py_ssize_t, Py_ssize_t, Py_ssize_t, Py_ssize_t) slice_get_indices_ex(
    slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX
):
    """
    Get (start, stop, step, length) tuple for a slice.

    If `objlen` is not specified, slice must be bounded, otherwise the result
    will be wrong.
    """
    cdef:
        Py_ssize_t lo, hi, stride, count

    if slc is None:
        raise TypeError("slc should be a slice")  # pragma: no cover

    PySlice_GetIndicesEx(slc, objlen, &lo, &hi, &stride, &count)
    return lo, hi, stride, count
|
||||
|
||||
|
||||
cdef slice_getitem(slice slc, ind):
    # Index the positions described by `slc` with `ind`.  A slice `ind`
    # gives back a slice; any other indexer is applied to the materialized
    # array of positions.
    cdef:
        Py_ssize_t s_start, s_stop, s_step, s_len
        Py_ssize_t ind_start, ind_stop, ind_step, ind_len

    s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)

    if not isinstance(ind, slice):
        # NOTE:
        # this is the C-optimized equivalent of
        # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
        return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]

    ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len)

    if ind_step > 0 and ind_len == s_len:
        # short-cut for no-op slice
        return slc

    if ind_step < 0:
        s_start = s_stop - s_step
        ind_step = -ind_step

    s_step *= ind_step
    # the new stop must be derived from the not-yet-shifted start
    s_stop = s_start + ind_stop * s_step
    s_start += ind_start * s_step

    if s_step < 0 and s_stop < 0:
        return slice(s_start, None, s_step)
    return slice(s_start, s_stop, s_step)
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cdef slice indexer_as_slice(intp_t[:] vals):
    # Return the slice equivalent to `vals`, or None when `vals` is empty,
    # contains a negative entry, or is not evenly spaced with a non-zero step.
    cdef:
        Py_ssize_t i, n, start, stop
        int64_t d

    if vals is None:
        raise TypeError("vals must be ndarray")  # pragma: no cover

    n = vals.shape[0]

    if n == 0 or vals[0] < 0:
        return None

    if n == 1:
        return slice(vals[0], vals[0] + 1, 1)

    # n >= 2: the first two entries fix the candidate step
    d = vals[1] - vals[0]
    if vals[1] < 0 or d == 0:
        return None

    # every later entry must be non-negative and continue the progression
    for i in range(2, n):
        if vals[i] < 0 or vals[i] - vals[i - 1] != d:
            return None

    start = vals[0]
    stop = start + n * d
    if stop < 0 and d < 0:
        # a descending run that reaches position 0 needs an open stop
        return slice(start, None, d)
    return slice(start, stop, d)
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
def get_blkno_indexers(
    int64_t[:] blknos, bint group=True
) -> list[tuple[int, slice | np.ndarray]]:
    """
    Enumerate contiguous runs of integers in ndarray.

    Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))``
    pairs for each contiguous run found.

    If `group` is True and there is more than one run for a certain blkno,
    ``(blkno, array)`` with an array containing positions of all elements equal
    to blkno.

    An empty `blknos` yields an empty list.

    Returns
    -------
    list[tuple[int, slice | np.ndarray]]
    """
    # There's blkno in this function's name because it's used in block &
    # blockno handling.
    cdef:
        int64_t cur_blkno
        Py_ssize_t i, start, stop, n, diff
        cnp.npy_intp tot_len
        int64_t blkno
        object group_dict = defaultdict(list)
        ndarray[int64_t, ndim=1] arr

    n = blknos.shape[0]
    result = list()

    if n == 0:
        # Bail out before touching blknos[0]: bounds checking is disabled
        # here, so reading from an empty buffer would be out of bounds.
        return result

    start = 0
    cur_blkno = blknos[start]

    if not group:
        # one (blkno, slice) entry per contiguous run, in order of appearance
        for i in range(1, n):
            if blknos[i] != cur_blkno:
                result.append((cur_blkno, slice(start, i)))

                start = i
                cur_blkno = blknos[i]

        result.append((cur_blkno, slice(start, n)))
    else:
        # collect the (start, stop) bounds of every run, keyed by blkno
        for i in range(1, n):
            if blknos[i] != cur_blkno:
                group_dict[cur_blkno].append((start, i))

                start = i
                cur_blkno = blknos[i]

        group_dict[cur_blkno].append((start, n))

        for blkno, slices in group_dict.items():
            if len(slices) == 1:
                result.append((blkno, slice(slices[0][0], slices[0][1])))
            else:
                tot_len = sum(stop - start for start, stop in slices)
                # equiv np.empty(tot_len, dtype=np.int64)
                arr = cnp.PyArray_EMPTY(1, &tot_len, cnp.NPY_INT64, 0)

                # fill with the positions covered by each run, in run order
                i = 0
                for start, stop in slices:
                    for diff in range(start, stop):
                        arr[i] = diff
                        i += 1

                result.append((blkno, arr))

    return result
|
||||
|
||||
|
||||
def get_blkno_placements(blknos, group: bool = True):
    """
    Wrap each indexer produced by get_blkno_indexers in a BlockPlacement.

    Parameters
    ----------
    blknos : np.ndarray[int64]
    group : bool, default True

    Returns
    -------
    iterator
        yield (blkno, BlockPlacement)
    """
    runs = get_blkno_indexers(ensure_int64(blknos), group)

    for blkno, indexer in runs:
        yield blkno, BlockPlacement(indexer)
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef update_blklocs_and_blknos(
    ndarray[intp_t, ndim=1] blklocs,
    ndarray[intp_t, ndim=1] blknos,
    Py_ssize_t loc,
    intp_t nblocks,
):
    """
    Update blklocs and blknos when a new column is inserted at 'loc'.
    """
    cdef:
        Py_ssize_t i
        cnp.npy_intp new_len = len(blklocs) + 1
        ndarray[intp_t, ndim=1] new_blklocs, new_blknos

    # equiv: np.empty(new_len, dtype=np.intp), once per output
    new_blklocs = cnp.PyArray_EMPTY(1, &new_len, cnp.NPY_INTP, 0)
    new_blknos = cnp.PyArray_EMPTY(1, &new_len, cnp.NPY_INTP, 0)

    # entries before the insertion point keep their position
    for i in range(loc):
        new_blklocs[i] = blklocs[i]
        new_blknos[i] = blknos[i]

    # the inserted slot gets blkloc 0 and blkno `nblocks`
    new_blklocs[loc] = 0
    new_blknos[loc] = nblocks

    # entries from the insertion point onwards move one slot to the right
    for i in range(loc, new_len - 1):
        new_blklocs[i + 1] = blklocs[i]
        new_blknos[i + 1] = blknos[i]

    return new_blklocs, new_blknos
|
||||
|
||||
|
||||
def _unpickle_block(values, placement, ndim):
    # Pickle hands the reconstruction arguments over positionally, but
    # new_block only accepts "ndim" as a keyword, hence this small shim.
    from pandas.core.internals.blocks import new_block as _build_block

    rebuilt = _build_block(values, placement, ndim=ndim)
    return rebuilt
|
||||
|
||||
|
||||
@cython.freelist(64)
cdef class SharedBlock:
    """
    Base extension type holding the placement and ndim of a block.

    Defining __init__ in a cython class significantly improves performance.
    """
    cdef:
        public BlockPlacement _mgr_locs
        readonly int ndim

    def __cinit__(self, values, placement: BlockPlacement, ndim: int):
        """
        Parameters
        ----------
        values : np.ndarray or ExtensionArray
            We assume maybe_coerce_values has already been called.
            Not stored here; the subclasses keep it.
        placement : BlockPlacement
        ndim : int
            1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
        """
        self.ndim = ndim
        self._mgr_locs = placement

    cpdef __reduce__(self):
        # Pickle via _unpickle_block(values, indexer, ndim).
        return _unpickle_block, (self.values, self.mgr_locs.indexer, self.ndim)

    cpdef __setstate__(self, state):
        # state is (placement, values[, ndim]); the third entry is missing
        # in older pickles.
        from pandas.core.construction import extract_array

        self.mgr_locs = BlockPlacement(state[0])
        self.values = extract_array(state[1], extract_numpy=True)

        if len(state) <= 2:
            # older pickle
            from pandas.core.internals.api import maybe_infer_ndim

            self.ndim = maybe_infer_ndim(self.values, self.mgr_locs)
        else:
            # we stored ndim
            self.ndim = state[2]
|
||||
|
||||
|
||||
cdef class NumpyBlock(SharedBlock):
    cdef public:
        ndarray values

    def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
        # Only `values` is stored here; the (implicit) call to
        # SharedBlock.__cinit__ takes care of placement and ndim.
        self.values = values

    cpdef NumpyBlock getitem_block_index(self, slice slicer):
        """
        Slice along the last axis of ``values``, keeping the placement.

        Perform __getitem__-like specialized to slicing along index.

        Assumes self.ndim == 2
        """
        return type(self)(self.values[..., slicer], self._mgr_locs, ndim=self.ndim)
|
||||
|
||||
|
||||
cdef class NDArrayBackedBlock(SharedBlock):
    """
    Block whose values are an NDArrayBackedExtensionArray.
    """
    cdef:
        public NDArrayBacked values

    def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
        # Only `values` is stored here; the (implicit) call to
        # SharedBlock.__cinit__ takes care of placement and ndim.
        self.values = values

    cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
        """
        Slice along the last axis of ``values``, keeping the placement.

        Perform __getitem__-like specialized to slicing along index.

        Assumes self.ndim == 2
        """
        return type(self)(self.values[..., slicer], self._mgr_locs, ndim=self.ndim)
|
||||
|
||||
|
||||
cdef class Block(SharedBlock):
    # SharedBlock subclass whose `values` attribute is an untyped object.
    cdef:
        public object values

    def __cinit__(self, object values, BlockPlacement placement, int ndim):
        # Set values here; the (implicit) call to SharedBlock.__cinit__ will
        # set placement and ndim.
        self.values = values
|
||||
|
||||
|
||||
@cython.freelist(64)
|
||||
cdef class BlockManager:
|
||||
cdef:
|
||||
public tuple blocks
|
||||
public list axes
|
||||
public bint _known_consolidated, _is_consolidated
|
||||
public ndarray _blknos, _blklocs
|
||||
|
||||
def __cinit__(self, blocks=None, axes=None, verify_integrity=True):
    # None as defaults for unpickling GH#42345
    if blocks is None:
        # This adds 1-2 microseconds to DataFrame(np.array([]))
        return

    # Backward compat for e.g. pyarrow: a list of blocks becomes a tuple
    self.blocks = tuple(blocks) if isinstance(blocks, list) else blocks
    # copy to make sure we are not remotely-mutable
    self.axes = axes.copy()

    # Populate known_consolidate, blknos, and blklocs lazily
    self._blknos = None
    self._blklocs = None
    self._is_consolidated = False
    self._known_consolidated = False
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Block Placement
|
||||
|
||||
def _rebuild_blknos_and_blklocs(self) -> None:
    """
    Update mgr._blknos / mgr._blklocs.

    Both arrays are recomputed from the ``_mgr_locs`` of every block in
    ``self.blocks`` and then stored on the manager.

    Raises
    ------
    AssertionError
        If some position is left unassigned (still -1) after all blocks
        have been visited.
    """
    cdef:
        intp_t blkno, i, j
        cnp.npy_intp length = self.shape[0]
        SharedBlock blk
        BlockPlacement bp
        ndarray[intp_t, ndim=1] new_blknos, new_blklocs

    # equiv: np.empty(length, dtype=np.intp)
    new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
    new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
    # equiv: new_blknos.fill(-1)
    cnp.PyArray_FILLWBYTE(new_blknos, -1)
    cnp.PyArray_FILLWBYTE(new_blklocs, -1)

    for blkno, blk in enumerate(self.blocks):
        bp = blk._mgr_locs
        # Iterating over `bp` is a faster equivalent to
        #  new_blknos[bp.indexer] = blkno
        #  new_blklocs[bp.indexer] = np.arange(len(bp))
        for i, j in enumerate(bp):
            new_blknos[j] = blkno
            new_blklocs[j] = i

    for i in range(length):
        # faster than `for blkno in new_blknos`
        # https://github.com/cython/cython/issues/4393
        blkno = new_blknos[i]

        # If there are any -1s remaining, this indicates that our mgr_locs
        # are invalid.
        if blkno == -1:
            raise AssertionError("Gaps in blk ref_locs")

    self._blknos = new_blknos
    self._blklocs = new_blklocs
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Pickle
|
||||
|
||||
cpdef __reduce__(self):
    # Pickle as (constructor, positional args).
    if len(self.axes) != 1:
        return type(self), (self.blocks, self.axes)

    # SingleBlockManager, __init__ expects Block, axis
    return type(self), (self.blocks[0], self.axes[0])
|
||||
|
||||
cpdef __setstate__(self, state):
    # Restore from a legacy pickle whose 4th tuple entry carries a
    # "0.14.1" payload with "axes" and "blocks"; anything else is rejected.
    from pandas.core.construction import extract_array
    from pandas.core.internals.blocks import (
        ensure_block_shape,
        new_block,
    )
    from pandas.core.internals.managers import ensure_index

    has_payload = (
        isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]
    )
    if not has_payload:  # pragma: no cover
        raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

    payload = state[3]["0.14.1"]
    axes = [ensure_index(ax) for ax in payload["axes"]]
    ndim = len(axes)

    # First pass: normalize the stored values in place.
    for entry in payload["blocks"]:
        # older versions may hold e.g. DatetimeIndex instead of DTA
        unwrapped = extract_array(entry["values"], extract_numpy=True)
        entry["values"] = ensure_block_shape(unwrapped, ndim=ndim)

    # Second pass: build the blocks from the normalized values.
    rebuilt = [
        new_block(entry["values"], entry["mgr_locs"], ndim=ndim)
        for entry in payload["blocks"]
    ]
    self.blocks = tuple(rebuilt)
    self.axes = axes

    self._post_setstate()
|
||||
|
||||
def _post_setstate(self) -> None:
    """
    Reset the consolidation flags and rebuild _blknos / _blklocs.
    """
    self._known_consolidated = False
    self._is_consolidated = False
    self._rebuild_blknos_and_blklocs()
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Indexing
|
||||
|
||||
cdef BlockManager _get_index_slice(self, slobj):
    # Build a new manager in which every block has been sliced with
    # `slobj` via getitem_block_index, and axes[1] is sliced to match.
    cdef:
        SharedBlock blk, nb
        BlockManager mgr
        ndarray blknos, blklocs

    nbs = []
    for blk in self.blocks:
        nb = blk.getitem_block_index(slobj)
        nbs.append(nb)

    new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
    mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)

    # We can avoid having to rebuild blklocs/blknos
    blklocs = self._blklocs
    blknos = self._blknos
    if blknos is not None:
        # Copies, so the new manager does not share these arrays with self.
        mgr._blknos = blknos.copy()
        mgr._blklocs = blklocs.copy()
    return mgr
|
||||
|
||||
def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
    """
    Return a new manager sliced by `slobj` along `axis`.

    axis == 1 is delegated to ``_get_index_slice``; axis == 0 goes through
    ``_slice_take_blocks_ax0``.

    Raises
    ------
    IndexError
        If `axis` is neither 0 nor 1.
    """
    if axis == 1:
        return self._get_index_slice(slobj)
    if axis != 0:
        raise IndexError("Requested axis not found in manager")

    new_blocks = self._slice_take_blocks_ax0(slobj)

    new_axes = list(self.axes)
    new_axes[axis] = new_axes[axis]._getitem_slice(slobj)

    return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
|
||||
Binary file not shown.
@@ -0,0 +1,557 @@
|
||||
import numbers
|
||||
from operator import (
|
||||
le,
|
||||
lt,
|
||||
)
|
||||
|
||||
from cpython.datetime cimport (
|
||||
PyDateTime_IMPORT,
|
||||
PyDelta_Check,
|
||||
)
|
||||
|
||||
PyDateTime_IMPORT
|
||||
|
||||
from cpython.object cimport (
|
||||
Py_EQ,
|
||||
Py_GE,
|
||||
Py_GT,
|
||||
Py_LE,
|
||||
Py_LT,
|
||||
Py_NE,
|
||||
PyObject_RichCompare,
|
||||
)
|
||||
|
||||
import cython
|
||||
from cython import Py_ssize_t
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
NPY_QUICKSORT,
|
||||
PyArray_ArgSort,
|
||||
PyArray_Take,
|
||||
float32_t,
|
||||
float64_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
ndarray,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
from pandas._libs cimport util
|
||||
from pandas._libs.hashtable cimport Int64Vector
|
||||
from pandas._libs.tslibs.timedeltas cimport _Timedelta
|
||||
from pandas._libs.tslibs.timestamps cimport _Timestamp
|
||||
from pandas._libs.tslibs.timezones cimport tz_compare
|
||||
from pandas._libs.tslibs.util cimport (
|
||||
is_float_object,
|
||||
is_integer_object,
|
||||
is_timedelta64_object,
|
||||
)
|
||||
|
||||
VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither'])
|
||||
|
||||
|
||||
cdef class IntervalMixin:
|
||||
|
||||
@property
def closed_left(self):
    """
    Whether the left endpoint is included in the interval.

    For the meaning of `closed` and `open` see :class:`~pandas.Interval`.

    Returns
    -------
    bool
        True if the Interval is closed on the left-side.
    """
    side = self.closed
    return side in ('both', 'left')
|
||||
|
||||
@property
def closed_right(self):
    """
    Check if the interval is closed on the right side.

    For the meaning of `closed` and `open` see :class:`~pandas.Interval`.

    Returns
    -------
    bool
        True if the Interval is closed on the right-side.
    """
    return self.closed in ('right', 'both')
|
||||
|
||||
@property
def open_left(self):
    """
    Check if the interval is open on the left side.

    For the meaning of `closed` and `open` see :class:`~pandas.Interval`.

    Returns
    -------
    bool
        True if the Interval is not closed on the left-side.
    """
    return not self.closed_left
|
||||
|
||||
@property
def open_right(self):
    """
    Check if the interval is open on the right side.

    For the meaning of `closed` and `open` see :class:`~pandas.Interval`.

    Returns
    -------
    bool
        True if the Interval is not closed on the right-side.
    """
    return not self.closed_right
|
||||
|
||||
@property
def mid(self):
    """
    Return the midpoint of the Interval.
    """
    try:
        endpoint_sum = self.left + self.right
        return 0.5 * endpoint_sum
    except TypeError:
        # datetime safe version: endpoints that cannot be added (or
        # scaled) are handled by offsetting left by half the length
        half_length = 0.5 * self.length
        return self.left + half_length
|
||||
|
||||
@property
def length(self):
    """
    Return the length of the Interval, i.e. ``right - left``.
    """
    upper, lower = self.right, self.left
    return upper - lower
|
||||
|
||||
@property
def is_empty(self):
    """
    Indicates if an interval is empty, meaning it contains no points.

    An interval is empty when its endpoints are equal and it is not
    closed on both sides.

    .. versionadded:: 0.25.0

    Returns
    -------
    bool or ndarray
        A boolean indicating if a scalar :class:`Interval` is empty, or a
        boolean ``ndarray`` positionally indicating if an ``Interval`` in
        an :class:`~arrays.IntervalArray` or :class:`IntervalIndex` is
        empty.

    Examples
    --------
    An :class:`Interval` that contains points is not empty:

    >>> pd.Interval(0, 1, closed='right').is_empty
    False

    An ``Interval`` that does not contain any points is empty:

    >>> pd.Interval(0, 0, closed='right').is_empty
    True
    >>> pd.Interval(0, 0, closed='left').is_empty
    True
    >>> pd.Interval(0, 0, closed='neither').is_empty
    True

    An ``Interval`` that contains a single point is not empty:

    >>> pd.Interval(0, 0, closed='both').is_empty
    False

    An :class:`~arrays.IntervalArray` or :class:`IntervalIndex` returns a
    boolean ``ndarray`` positionally indicating if an ``Interval`` is
    empty:

    >>> ivs = [pd.Interval(0, 0, closed='neither'),
    ...        pd.Interval(1, 2, closed='neither')]
    >>> pd.arrays.IntervalArray(ivs).is_empty
    array([ True, False])

    Missing values are not considered empty:

    >>> ivs = [pd.Interval(0, 0, closed='neither'), np.nan]
    >>> pd.IntervalIndex(ivs).is_empty
    array([ True, False])
    """
    endpoints_equal = self.right == self.left
    not_fully_closed = self.closed != 'both'
    # `&` rather than `and` so array-valued endpoints combine elementwise
    return endpoints_equal & not_fully_closed
|
||||
|
||||
def _check_closed_matches(self, other, name='other'):
    """
    Ensure `other` is closed on exactly the same side(s) as self.

    Note that 'left' and 'right' are considered different from 'both'.

    Parameters
    ----------
    other : Interval, IntervalIndex, IntervalArray
    name : str
        Name to use for 'other' in the error message.

    Raises
    ------
    ValueError
        When `other` is not closed exactly the same as self.
    """
    mismatch = self.closed != other.closed
    if mismatch:
        msg = (f"'{name}.closed' is {repr(other.closed)}, "
               f"expected {repr(self.closed)}.")
        raise ValueError(msg)
|
||||
|
||||
|
||||
cdef bint _interval_like(other):
    # Duck-typed check: anything exposing left, right and closed qualifies.
    for attr in ('left', 'right', 'closed'):
        if not hasattr(other, attr):
            return False
    return True
|
||||
|
||||
|
||||
cdef class Interval(IntervalMixin):
|
||||
"""
|
||||
Immutable object implementing an Interval, a bounded slice-like interval.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
left : orderable scalar
|
||||
Left bound for the interval.
|
||||
right : orderable scalar
|
||||
Right bound for the interval.
|
||||
closed : {'right', 'left', 'both', 'neither'}, default 'right'
|
||||
Whether the interval is closed on the left-side, right-side, both or
|
||||
neither. See the Notes for more detailed explanation.
|
||||
|
||||
See Also
|
||||
--------
|
||||
IntervalIndex : An Index of Interval objects that are all closed on the
|
||||
same side.
|
||||
cut : Convert continuous data into discrete bins (Categorical
|
||||
of Interval objects).
|
||||
qcut : Convert continuous data into bins (Categorical of Interval objects)
|
||||
based on quantiles.
|
||||
Period : Represents a period of time.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The parameters `left` and `right` must be of the same type, you must be
|
||||
able to compare them and they must satisfy ``left <= right``.
|
||||
|
||||
A closed interval (in mathematics denoted by square brackets) contains
|
||||
its endpoints, i.e. the closed interval ``[0, 5]`` is characterized by the
|
||||
conditions ``0 <= x <= 5``. This is what ``closed='both'`` stands for.
|
||||
An open interval (in mathematics denoted by parentheses) does not contain
|
||||
its endpoints, i.e. the open interval ``(0, 5)`` is characterized by the
|
||||
conditions ``0 < x < 5``. This is what ``closed='neither'`` stands for.
|
||||
Intervals can also be half-open or half-closed, i.e. ``[0, 5)`` is
|
||||
described by ``0 <= x < 5`` (``closed='left'``) and ``(0, 5]`` is
|
||||
described by ``0 < x <= 5`` (``closed='right'``).
|
||||
|
||||
Examples
|
||||
--------
|
||||
It is possible to build Intervals of different types, like numeric ones:
|
||||
|
||||
>>> iv = pd.Interval(left=0, right=5)
|
||||
>>> iv
|
||||
Interval(0, 5, closed='right')
|
||||
|
||||
You can check if an element belongs to it
|
||||
|
||||
>>> 2.5 in iv
|
||||
True
|
||||
|
||||
You can test the bounds (``closed='right'``, so ``0 < x <= 5``):
|
||||
|
||||
>>> 0 in iv
|
||||
False
|
||||
>>> 5 in iv
|
||||
True
|
||||
>>> 0.0001 in iv
|
||||
True
|
||||
|
||||
Calculate its length
|
||||
|
||||
>>> iv.length
|
||||
5
|
||||
|
||||
You can operate with `+` and `*` over an Interval and the operation
|
||||
is applied to each of its bounds, so the result depends on the type
|
||||
of the bound elements
|
||||
|
||||
>>> shifted_iv = iv + 3
|
||||
>>> shifted_iv
|
||||
Interval(3, 8, closed='right')
|
||||
>>> extended_iv = iv * 10.0
|
||||
>>> extended_iv
|
||||
Interval(0.0, 50.0, closed='right')
|
||||
|
||||
To create a time interval you can use Timestamps as the bounds
|
||||
|
||||
>>> year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
|
||||
... pd.Timestamp('2018-01-01 00:00:00'),
|
||||
... closed='left')
|
||||
>>> pd.Timestamp('2017-01-01 00:00') in year_2017
|
||||
True
|
||||
>>> year_2017.length
|
||||
Timedelta('365 days 00:00:00')
|
||||
"""
|
||||
_typ = "interval"
|
||||
__array_priority__ = 1000
|
||||
|
||||
cdef readonly object left
|
||||
"""
|
||||
Left bound for the interval.
|
||||
"""
|
||||
|
||||
cdef readonly object right
|
||||
"""
|
||||
Right bound for the interval.
|
||||
"""
|
||||
|
||||
cdef readonly str closed
|
||||
"""
|
||||
Whether the interval is closed on the left-side, right-side, both or
|
||||
neither.
|
||||
"""
|
||||
|
||||
def __init__(self, left, right, str closed='right'):
    # note: it is faster to just do these checks than to use a special
    # constructor (__cinit__/__new__) to avoid them
    for endpoint in (left, right):
        self._validate_endpoint(endpoint)

    if closed not in VALID_CLOSED:
        raise ValueError(f"invalid option for 'closed': {closed}")
    if not left <= right:
        raise ValueError("left side of interval must be <= right side")
    if isinstance(left, _Timestamp):
        if not tz_compare(left.tzinfo, right.tzinfo):
            # GH 18538
            # NOTE: the stray quote in the message is kept as-is so the
            # text stays identical for anything matching on it.
            raise ValueError("left and right must have the same time zone, got "
                             f"{repr(left.tzinfo)}' and {repr(right.tzinfo)}")

    self.left = left
    self.right = right
    self.closed = closed
|
||||
|
||||
def _validate_endpoint(self, endpoint):
    # GH 23013
    # Accept integers, floats, Timestamps and Timedeltas; reject the rest.
    if is_integer_object(endpoint) or is_float_object(endpoint):
        return
    if isinstance(endpoint, (_Timestamp, _Timedelta)):
        return
    raise ValueError("Only numeric, Timestamp and Timedelta endpoints "
                     "are allowed when constructing an Interval.")
|
||||
|
||||
def __hash__(self):
    # Hash the (left, right, closed) triple.
    identity = (self.left, self.right, self.closed)
    return hash(identity)
|
||||
|
||||
def __contains__(self, key) -> bool:
    # Membership is only defined for scalars, not for interval-like keys.
    if _interval_like(key):
        raise TypeError("__contains__ not defined for two intervals")

    # Strict comparison on an open side, inclusive on a closed side.
    left_ok = self.left < key if self.open_left else self.left <= key
    if not left_ok:
        return False
    if self.open_right:
        return key < self.right
    return key <= self.right
|
||||
|
||||
def __richcmp__(self, other, op: int):
    # Interval vs Interval: compare the (left, right, closed) tuples.
    if isinstance(other, Interval):
        return PyObject_RichCompare(
            (self.left, self.right, self.closed),
            (other.left, other.right, other.closed),
            op,
        )

    # Interval vs array: compare against each element, boolean result.
    if util.is_array(other):
        elementwise = [PyObject_RichCompare(self, item, op) for item in other]
        return np.array(elementwise, dtype=bool)

    return NotImplemented
|
||||
|
||||
def __reduce__(self):
    # Pickle as a constructor call with (left, right, closed).
    return type(self), (self.left, self.right, self.closed)
|
||||
|
||||
def _repr_base(self):
    # Endpoints as used by __repr__/__str__; a pair of Timestamps is
    # replaced by their `_short_repr`.
    left, right = self.left, self.right

    # TODO: need more general formatting methodology here
    both_timestamps = isinstance(left, _Timestamp) and isinstance(right, _Timestamp)
    if both_timestamps:
        return left._short_repr, right._short_repr

    return left, right
|
||||
|
||||
def __repr__(self) -> str:
    # Constructor-like form: ClassName(left, right, closed=...)
    name = type(self).__name__
    left, right = self._repr_base()
    return f'{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})'
|
||||
|
||||
def __str__(self) -> str:
    # Mathematical notation: square bracket on a closed side,
    # parenthesis on an open side.
    left, right = self._repr_base()

    if self.closed_left:
        start_symbol = '['
    else:
        start_symbol = '('

    if self.closed_right:
        end_symbol = ']'
    else:
        end_symbol = ')'

    return f'{start_symbol}{left}, {right}{end_symbol}'
|
||||
|
||||
def __add__(self, y):
    # Shift both endpoints by a number or timedelta-like `y`.
    y_is_offset = (
        isinstance(y, numbers.Number)
        or PyDelta_Check(y)
        or is_timedelta64_object(y)
    )
    if y_is_offset:
        return Interval(self.left + y, self.right + y, closed=self.closed)

    # Here `self` may be the scalar and `y` the Interval (presumably the
    # reversed-operand call; TODO confirm against the Cython version used).
    if isinstance(y, Interval):
        self_is_offset = (
            isinstance(self, numbers.Number)
            or PyDelta_Check(self)
            or is_timedelta64_object(self)
        )
        if self_is_offset:
            return Interval(y.left + self, y.right + self, closed=y.closed)

    return NotImplemented
|
||||
|
||||
def __sub__(self, y):
    # Shift both endpoints down by a number or timedelta-like `y`.
    y_is_offset = (
        isinstance(y, numbers.Number)
        or PyDelta_Check(y)
        or is_timedelta64_object(y)
    )
    if not y_is_offset:
        return NotImplemented
    return Interval(self.left - y, self.right - y, closed=self.closed)
|
||||
|
||||
def __mul__(self, y):
    # Scale both endpoints by a number.
    if isinstance(y, numbers.Number):
        return Interval(self.left * y, self.right * y, closed=self.closed)

    # Here `self` may be the number and `y` the Interval.
    if isinstance(y, Interval):
        if isinstance(self, numbers.Number):
            return Interval(y.left * self, y.right * self, closed=y.closed)

    return NotImplemented
|
||||
|
||||
def __truediv__(self, y):
    # Divide both endpoints by a number; other operands are not handled.
    if not isinstance(y, numbers.Number):
        return NotImplemented
    return Interval(self.left / y, self.right / y, closed=self.closed)
|
||||
|
||||
def __floordiv__(self, y):
    # Floor-divide both endpoints by a number; other operands are not
    # handled.
    if not isinstance(y, numbers.Number):
        return NotImplemented
    new_left = self.left // y
    new_right = self.right // y
    return Interval(new_left, new_right, closed=self.closed)
|
||||
|
||||
def overlaps(self, other):
    """
    Check whether two Interval objects overlap.

    Two intervals overlap if they share a common point, including closed
    endpoints. Intervals that only have an open endpoint in common do not
    overlap.

    Parameters
    ----------
    other : Interval
        Interval to check against for an overlap.

    Returns
    -------
    bool
        True if the two intervals overlap.

    Raises
    ------
    TypeError
        If `other` is not an Interval.

    See Also
    --------
    IntervalArray.overlaps : The corresponding method for IntervalArray.
    IntervalIndex.overlaps : The corresponding method for IntervalIndex.

    Examples
    --------
    >>> i1 = pd.Interval(0, 2)
    >>> i2 = pd.Interval(1, 3)
    >>> i1.overlaps(i2)
    True
    >>> i3 = pd.Interval(4, 5)
    >>> i1.overlaps(i3)
    False

    Intervals that share closed endpoints overlap:

    >>> i4 = pd.Interval(0, 1, closed='both')
    >>> i5 = pd.Interval(1, 2, closed='both')
    >>> i4.overlaps(i5)
    True

    Intervals that only have an open endpoint in common do not overlap:

    >>> i6 = pd.Interval(1, 2, closed='neither')
    >>> i4.overlaps(i6)
    False
    """
    if not isinstance(other, Interval):
        raise TypeError("`other` must be an Interval, "
                        f"got {type(other).__name__}")

    # equality is okay if both endpoints are closed (overlap at a point)
    if self.closed_left and other.closed_right:
        op1 = le
    else:
        op1 = lt

    if other.closed_left and self.closed_right:
        op2 = le
    else:
        op2 = lt

    # overlapping is the negation of the two intervals being disjoint:
    #   disjoint = (A.left > B.right) or (B.left > A.right)
    # (simplifying the negation allows this to be done in fewer operations)
    return op1(self.left, other.right) and op2(other.left, self.right)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True):
    """
    Split an array of Intervals into arrays of left and right bounds.

    Parameters
    ----------
    intervals : ndarray
        Object array of Intervals / nulls.

    validate_closed: bool, default True
        Boolean indicating if all intervals must be closed on the same side.
        Mismatching closed will raise if True, else return None for closed.

    Returns
    -------
    tuple of
        left : ndarray
        right : ndarray
        closed: str or None
            None when no Interval was seen, or when the intervals disagree
            and ``validate_closed`` is False.

    Raises
    ------
    TypeError
        If an element is neither null nor an Interval.
    ValueError
        If ``validate_closed`` is True and the intervals are not all closed
        on the same side.
    """
    cdef:
        object closed = None, interval
        Py_ssize_t i, n = len(intervals)
        ndarray left, right
        bint seen_closed = False

    left = np.empty(n, dtype=intervals.dtype)
    right = np.empty(n, dtype=intervals.dtype)

    for i in range(n):
        interval = intervals[i]
        # Nulls (None / NaN) become NaN on both sides.
        if interval is None or util.is_nan(interval):
            left[i] = np.nan
            right[i] = np.nan
            continue

        if not isinstance(interval, Interval):
            raise TypeError(f"type {type(interval)} with value "
                            f"{interval} is not an interval")

        left[i] = interval.left
        right[i] = interval.right
        if not seen_closed:
            # The first Interval encountered fixes the expected `closed`.
            seen_closed = True
            closed = interval.closed
        elif closed != interval.closed:
            closed = None
            if validate_closed:
                raise ValueError("intervals must all be closed on the same side")

    return left, right, closed
|
||||
|
||||
|
||||
include "intervaltree.pxi"
|
||||
@@ -0,0 +1,427 @@
|
||||
"""
|
||||
Template for intervaltree
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
from pandas._libs.algos import is_monotonic
|
||||
|
||||
ctypedef fused int_scalar_t:
|
||||
int64_t
|
||||
float64_t
|
||||
|
||||
ctypedef fused uint_scalar_t:
|
||||
uint64_t
|
||||
float64_t
|
||||
|
||||
ctypedef fused scalar_t:
|
||||
int_scalar_t
|
||||
uint_scalar_t
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# IntervalTree
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
cdef class IntervalTree(IntervalMixin):
|
||||
"""A centered interval tree
|
||||
|
||||
Based off the algorithm described on Wikipedia:
|
||||
https://en.wikipedia.org/wiki/Interval_tree
|
||||
|
||||
we are emulating the IndexEngine interface
|
||||
"""
|
||||
cdef readonly:
|
||||
ndarray left, right
|
||||
IntervalNode root
|
||||
object dtype
|
||||
str closed
|
||||
object _is_overlapping, _left_sorter, _right_sorter
|
||||
Py_ssize_t _na_count
|
||||
|
||||
def __init__(self, left, right, closed='right', leaf_size=100):
    """
    Parameters
    ----------
    left, right : np.ndarray[ndim=1]
        Left and right bounds for each interval. Positions where `left`
        is NaN are dropped from both arrays before the tree is built and
        are counted in ``_na_count``.
    closed : {'left', 'right', 'both', 'neither'}, optional
        Whether the intervals are closed on the left-side, right-side, both
        or neither. Defaults to 'right'.
    leaf_size : int, optional
        Parameter that controls when the tree switches from creating nodes
        to brute-force search. Tune this parameter to optimize query
        performance.

    Raises
    ------
    ValueError
        If `closed` is not one of the accepted options.
    """
    if closed not in ['left', 'right', 'both', 'neither']:
        # f-string rather than "%s" % closed: %-formatting treats a tuple
        # argument as the argument list and can raise TypeError instead.
        raise ValueError(f"invalid option for 'closed': {closed}")

    left = np.asarray(left)
    right = np.asarray(right)
    # Both sides are stored with their common result dtype.
    self.dtype = np.result_type(left, right)
    self.left = np.asarray(left, dtype=self.dtype)
    self.right = np.asarray(right, dtype=self.dtype)

    indices = np.arange(len(left), dtype='int64')

    self.closed = closed

    # GH 23352: ensure no nan in nodes
    mask = ~np.isnan(self.left)
    self._na_count = len(mask) - mask.sum()
    self.left = self.left[mask]
    self.right = self.right[mask]
    indices = indices[mask]

    # The node class is looked up by (dtype name, closed).
    node_cls = NODE_CLASSES[str(self.dtype), closed]
    self.root = node_cls(self.left, self.right, indices, leaf_size)
|
||||
|
||||
@property
def left_sorter(self) -> np.ndarray:
    """
    Argsort of the left bounds; this is used for binary search.

    Computed on first access and cached in ``_left_sorter``.
    """
    cached = self._left_sorter
    if cached is not None:
        return cached
    self._left_sorter = np.argsort(self.left)
    return self._left_sorter
|
||||
|
||||
@property
def right_sorter(self) -> np.ndarray:
    """
    Argsort of the right bounds.

    Computed on first access and cached in ``_right_sorter``.
    """
    cached = self._right_sorter
    if cached is not None:
        return cached
    self._right_sorter = np.argsort(self.right)
    return self._right_sorter
|
||||
|
||||
@property
def is_overlapping(self) -> bool:
    """
    Determine if the IntervalTree contains overlapping intervals.
    Cached as self._is_overlapping.
    """
    if self._is_overlapping is None:
        # <= when both sides closed since endpoints can overlap
        compare = le if self.closed == 'both' else lt

        # overlap if start of current interval < end of previous interval
        # (current and previous in terms of sorted order by left/start side)
        order = self.left_sorter
        starts = self.left[order[1:]]
        previous_ends = self.right[order[:-1]]
        self._is_overlapping = bool(compare(starts, previous_ends).any())

    return self._is_overlapping
|
||||
|
||||
@property
def is_monotonic_increasing(self) -> bool:
    """
    Return True if the IntervalTree is monotonic increasing (only equal or
    increasing values), else False

    A tree from which NaN entries were dropped is never monotonic.
    """
    if self._na_count > 0:
        return False

    # lexsort uses the last key as primary: order by left, then by right
    sort_order = np.lexsort([self.right, self.left])
    return is_monotonic(sort_order, False)[0]
|
||||
|
||||
def get_indexer(self, scalar_t[:] target) -> np.ndarray:
    """Return the positions corresponding to unique intervals that overlap
    with the given array of scalar targets.

    Each target yields exactly one entry in the result: the value appended
    by the root node's query, or -1 when the query appended nothing.

    Raises
    ------
    KeyError
        If the query for a single target appends more than one entry.
    """

    # TODO: write get_indexer_intervals
    cdef:
        Py_ssize_t old_len
        Py_ssize_t i
        Int64Vector result

    result = Int64Vector()
    old_len = 0
    for i in range(len(target)):
        try:
            self.root.query(result, target[i])
        except OverflowError:
            # overflow -> no match, which is already handled below
            pass

        # Compare the vector length before/after the query to see how
        # many entries this target produced.
        if result.data.n == old_len:
            result.append(-1)
        elif result.data.n > old_len + 1:
            raise KeyError(
                'indexer does not intersect a unique set of intervals')
        old_len = result.data.n
    return result.to_array().astype('intp')
|
||||
|
||||
def get_indexer_non_unique(self, scalar_t[:] target):
    """Return the positions corresponding to intervals that overlap with
    the given array of scalar targets. Non-unique positions are repeated.

    Returns
    -------
    tuple of ndarray[intp]
        (indexer, missing): ``indexer`` holds every entry appended by the
        queries, with a -1 for each target whose query appended nothing;
        ``missing`` holds the positions in `target` of those targets.
    """
    cdef:
        Py_ssize_t old_len
        Py_ssize_t i
        Int64Vector result, missing

    result = Int64Vector()
    missing = Int64Vector()
    old_len = 0
    for i in range(len(target)):
        try:
            self.root.query(result, target[i])
        except OverflowError:
            # overflow -> no match, which is already handled below
            pass

        # Unchanged length means this target produced no entries.
        if result.data.n == old_len:
            result.append(-1)
            missing.append(i)
        old_len = result.data.n
    return (result.to_array().astype('intp'),
            missing.to_array().astype('intp'))
|
||||
|
||||
def __repr__(self) -> str:
    # Summary with dtype, closed side and the root node's element count.
    template = ('<IntervalTree[{dtype},{closed}]: '
                '{n_elements} elements>')
    return template.format(
        dtype=self.dtype,
        closed=self.closed,
        n_elements=self.root.n_elements,
    )
|
||||
|
||||
# compat with IndexEngine interface
def clear_mapping(self) -> None:
    """Do nothing; present only so the IndexEngine-style call exists."""
    return None
|
||||
|
||||
|
||||
cdef take(ndarray source, ndarray indices):
    """Return the entries of the 1D ``source`` found at ``indices``."""
    # a 1D array has a single axis, so always take along axis 0
    cdef int axis = 0
    return PyArray_Take(source, indices, axis)
|
||||
|
||||
|
||||
cdef sort_values_and_indices(all_values, all_indices, subset):
    """
    Select ``subset`` from both arrays and order the pair by value.

    Returns ``(sorted_values, sorted_indices)`` where both arrays are
    reordered with the same argsort of the selected values.
    """
    picked_indices = take(all_indices, subset)
    picked_values = take(all_values, subset)
    order = PyArray_ArgSort(picked_values, 0, NPY_QUICKSORT)
    return take(picked_values, order), take(picked_indices, order)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Nodes
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
@cython.internal
cdef class IntervalNode:
    """Bookkeeping fields and debugging helpers shared by all node classes."""
    cdef readonly:
        int64_t n_elements, n_center, leaf_size
        bint is_leaf_node

    def __repr__(self) -> str:
        cls_name = type(self).__name__
        if self.is_leaf_node:
            return f"<{cls_name}: {self.n_elements} elements (terminal)>"

        # whatever is stored in neither child overlaps the pivot
        left_count = self.left_node.n_elements
        right_count = self.right_node.n_elements
        overlap_count = self.n_elements - left_count - right_count
        return (
            f"<{cls_name}: "
            f"pivot {self.pivot}, {self.n_elements} elements "
            f"({left_count} left, {right_count} right, "
            f"{overlap_count} overlapping)>"
        )

    def counts(self):
        """
        Report element counts for this node (debugging aid).

        A leaf returns its element count; any other node returns
        ``(n_center, (left_counts, right_counts))``, recursing into
        both children.
        """
        if self.is_leaf_node:
            return self.n_elements

        n_mid = len(self.center_left_values)
        left_counts = self.left_node.counts()
        right_counts = self.right_node.counts()
        return (n_mid, (left_counts, right_counts))
|
||||
|
||||
|
||||
# we need specialized nodes and leaves to optimize for different dtype and
|
||||
# closed values
|
||||
|
||||
{{py:

# Build one template row per (dtype, closed) combination.
# NOTE: only plain loops and lookups are used here; no comprehensions or
# helper functions, so every name is resolved in this block's own scope.

# (closed, cmp_left, cmp_right): operators for ``left cmp_left x cmp_right right``
closed_operators = [
    ('left', '<=', '<'),
    ('right', '<', '<='),
    ('both', '<=', '<='),
    ('neither', '<', '<'),
]

# the "converse" operator swaps strict and non-strict comparison
converse_of = {'<=': '<', '<': '<='}

# prefix of the fused scalar type accepted by ``query`` for each dtype
fused_prefix_of = {'float64': '', 'int64': 'int_', 'uint64': 'uint_'}

nodes = []
for dtype in ['float64', 'int64', 'uint64']:
    for closed, cmp_left, cmp_right in closed_operators:
        cmp_left_converse = converse_of[cmp_left]
        cmp_right_converse = converse_of[cmp_right]
        fused_prefix = fused_prefix_of[dtype]
        nodes.append((dtype, dtype.title(),
                      closed, closed.title(),
                      cmp_left,
                      cmp_right,
                      cmp_left_converse,
                      cmp_right_converse,
                      fused_prefix))

}}
|
||||
|
||||
NODE_CLASSES = {}

{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right,
      cmp_left_converse, cmp_right_converse, fused_prefix in nodes}}


@cython.internal
cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
    """Node of an IntervalTree specialised for one dtype and closed side.

    A node holding at most ``leaf_size`` intervals is a leaf and keeps them
    in flat arrays.  Any other node picks a pivot and splits its intervals
    into those entirely left of the pivot, those entirely right of it, and
    those that overlap it.
    """
    cdef readonly:
        {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node
        {{dtype}}_t[:] center_left_values, center_right_values, left, right
        int64_t[:] center_left_indices, center_right_indices, indices
        {{dtype}}_t min_left, max_right
        {{dtype}}_t pivot

    def __init__(self,
                 ndarray[{{dtype}}_t, ndim=1] left,
                 ndarray[{{dtype}}_t, ndim=1] right,
                 ndarray[int64_t, ndim=1] indices,
                 int64_t leaf_size):

        self.leaf_size = leaf_size
        self.n_elements = len(left)

        # min_left and max_right let query() skip a whole sub-node.  For an
        # empty node the query is cheap anyway, so the values are arbitrary.
        if left.size == 0:
            self.min_left = 0
            self.max_right = 0
        else:
            self.min_left = left.min()
            self.max_right = right.max()

        self.is_leaf_node = self.n_elements <= leaf_size
        if self.is_leaf_node:
            # terminal node: keep the intervals as they are
            self.left = left
            self.right = right
            self.indices = indices
            self.n_center = 0
            return

        # too many intervals for a leaf: split them around a pivot
        self.pivot = np.median(left / 2 + right / 2)
        left_set, right_set, center_set = self.classify_intervals(
            left, right)

        self.left_node = self.new_child_node(left, right,
                                             indices, left_set)
        self.right_node = self.new_child_node(left, right,
                                              indices, right_set)

        # the overlapping intervals are stored twice: once ordered by their
        # left bound and once ordered by their right bound
        by_left = sort_values_and_indices(left, indices, center_set)
        self.center_left_values, self.center_left_indices = by_left
        by_right = sort_values_and_indices(right, indices, center_set)
        self.center_right_values, self.center_right_indices = by_right
        self.n_center = len(self.center_left_indices)

    @cython.wraparound(False)
    @cython.boundscheck(False)
    cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right):
        """Split interval positions into three groups relative to the pivot.

        Returns three int64 arrays of positions: intervals left of the
        pivot, intervals right of the pivot, and intervals overlapping it.
        """
        cdef:
            Int64Vector below, above, straddling
            Py_ssize_t k

        below = Int64Vector()
        above = Int64Vector()
        straddling = Int64Vector()

        for k in range(self.n_elements):
            if right[k] {{cmp_right_converse}} self.pivot:
                below.append(k)
            elif self.pivot {{cmp_left_converse}} left[k]:
                above.append(k)
            else:
                straddling.append(k)

        return (below.to_array(),
                above.to_array(),
                straddling.to_array())

    cdef new_child_node(self,
                        ndarray[{{dtype}}_t, ndim=1] left,
                        ndarray[{{dtype}}_t, ndim=1] right,
                        ndarray[int64_t, ndim=1] indices,
                        ndarray[int64_t, ndim=1] subset):
        """Build a child node from the intervals selected by ``subset``.
        """
        left = take(left, subset)
        right = take(right, subset)
        indices = take(indices, subset)
        child = {{dtype_title}}Closed{{closed_title}}IntervalNode(
            left, right, indices, self.leaf_size)
        return child

    @cython.wraparound(False)
    @cython.boundscheck(False)
    @cython.initializedcheck(False)
    cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point):
        """Append to ``result`` the index of every interval in this node or
        its sub-nodes that contains ``point``.
        """
        cdef:
            int64_t[:] positions
            {{dtype}}_t[:] bounds
            Py_ssize_t k

        if self.is_leaf_node:
            # A leaf is small enough that a linear scan is used instead of
            # any further tree structure.
            for k in range(self.n_elements):
                if self.left[k] {{cmp_left}} point {{cmp_right}} self.right[k]:
                    result.append(self.indices[k])
            return

        # Non-leaf: compare the point with the pivot, scan the overlapping
        # intervals, then descend into the child on the matching side.
        if point < self.pivot:
            # left bounds are in ascending order, so stop at the first
            # interval that starts after the point
            bounds = self.center_left_values
            positions = self.center_left_indices
            for k in range(self.n_center):
                if not bounds[k] {{cmp_left}} point:
                    break
                result.append(positions[k])
            if point {{cmp_right}} self.left_node.max_right:
                self.left_node.query(result, point)
        elif point > self.pivot:
            # right bounds are in ascending order, so walk backwards and
            # stop at the first interval that ends before the point
            bounds = self.center_right_values
            positions = self.center_right_indices
            for k in range(self.n_center - 1, -1, -1):
                if not point {{cmp_right}} bounds[k]:
                    break
                result.append(positions[k])
            if self.right_node.min_left {{cmp_left}} point:
                self.right_node.query(result, point)
        else:
            # neither below nor above the pivot: take every overlapping
            # interval and do not descend
            result.extend(self.center_left_indices)


NODE_CLASSES['{{dtype}}',
             '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode

{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,93 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
# Type stubs only: each signature ends in ``...``.  The trailing comments on
# parameters record the narrower memoryview/ndarray type that the np.ndarray
# annotation stands in for.

# Joins whose inputs are annotated as intp arrays; each returns a pair of
# intp arrays.
def inner_join(
    left: np.ndarray, # const intp_t[:]
    right: np.ndarray, # const intp_t[:]
    max_groups: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def left_outer_join(
    left: np.ndarray, # const intp_t[:]
    right: np.ndarray, # const intp_t[:]
    max_groups: int,
    sort: bool = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def full_outer_join(
    left: np.ndarray, # const intp_t[:]
    right: np.ndarray, # const intp_t[:]
    max_groups: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def ffill_indexer(
    indexer: np.ndarray, # const intp_t[:]
) -> npt.NDArray[np.intp]: ...

# Joins on arrays of a fused ``join_t`` element type.  Apart from the
# ``_unique`` variant, each returns an array annotated as join_t followed by
# two intp arrays.
def left_join_indexer_unique(
    left: np.ndarray, # ndarray[join_t]
    right: np.ndarray, # ndarray[join_t]
) -> npt.NDArray[np.intp]: ...
def left_join_indexer(
    left: np.ndarray, # ndarray[join_t]
    right: np.ndarray, # ndarray[join_t]
) -> tuple[
    np.ndarray, # np.ndarray[join_t]
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
]: ...
def inner_join_indexer(
    left: np.ndarray, # ndarray[join_t]
    right: np.ndarray, # ndarray[join_t]
) -> tuple[
    np.ndarray, # np.ndarray[join_t]
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
]: ...
def outer_join_indexer(
    left: np.ndarray, # ndarray[join_t]
    right: np.ndarray, # ndarray[join_t]
) -> tuple[
    np.ndarray, # np.ndarray[join_t]
    npt.NDArray[np.intp],
    npt.NDArray[np.intp],
]: ...

# asof joins with additional "by" arrays (``by_t``); each returns a pair of
# intp arrays.
# NOTE(review): the backward/forward/nearest suffixes presumably name the
# search direction -- confirm against the implementation.
def asof_join_backward_on_X_by_Y(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    left_by_values: np.ndarray, # by_t[:]
    right_by_values: np.ndarray, # by_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_forward_on_X_by_Y(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    left_by_values: np.ndarray, # by_t[:]
    right_by_values: np.ndarray, # by_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_nearest_on_X_by_Y(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    left_by_values: np.ndarray, # by_t[:]
    right_by_values: np.ndarray, # by_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

# asof joins without "by" arrays; same return annotation as above.
def asof_join_backward(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_forward(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
def asof_join_nearest(
    left_values: np.ndarray, # asof_t[:]
    right_values: np.ndarray, # asof_t[:]
    allow_exact_matches: bool = ...,
    tolerance: np.number | int | float | None = ...,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
|
||||
1006
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/join.pyx
Normal file
1006
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/join.pyx
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,129 @@
|
||||
from cpython.object cimport PyObject
|
||||
from numpy cimport (
|
||||
complex64_t,
|
||||
complex128_t,
|
||||
float32_t,
|
||||
float64_t,
|
||||
int8_t,
|
||||
int16_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
uint8_t,
|
||||
uint16_t,
|
||||
uint32_t,
|
||||
uint64_t,
|
||||
)
|
||||
|
||||
|
||||
# Declarations of the C hash-table API exposed by "khash_python.h".
# Nothing is implemented here; these only make the C names usable from Cython.
cdef extern from "khash_python.h":
    const int KHASH_TRACE_DOMAIN

    # unsigned 32-bit type used for table sizes and iterator positions
    ctypedef uint32_t khuint_t
    ctypedef khuint_t khiter_t

    ctypedef struct khcomplex128_t:
        double real
        double imag

    # The quoted string is the C name; the identifier before it is the name
    # used from Cython.
    bint are_equivalent_khcomplex128_t \
        "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil

    ctypedef struct khcomplex64_t:
        float real
        float imag

    bint are_equivalent_khcomplex64_t \
        "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil

    bint are_equivalent_float64_t \
        "kh_floats_hash_equal" (float64_t a, float64_t b) nogil

    bint are_equivalent_float32_t \
        "kh_floats_hash_equal" (float32_t a, float32_t b) nogil

    # These take Python objects and are not declared nogil.
    uint32_t kh_python_hash_func(object key)
    bint kh_python_hash_equal(object a, object b)

    # Table with PyObject* keys and size_t values.  Its functions are not
    # declared nogil, unlike the string tables below.
    ctypedef struct kh_pymap_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        PyObject **keys
        size_t *vals

    kh_pymap_t* kh_init_pymap()
    void kh_destroy_pymap(kh_pymap_t*)
    void kh_clear_pymap(kh_pymap_t*)
    khuint_t kh_get_pymap(kh_pymap_t*, PyObject*)
    void kh_resize_pymap(kh_pymap_t*, khuint_t)
    khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
    void kh_del_pymap(kh_pymap_t*, khuint_t)

    bint kh_exist_pymap(kh_pymap_t*, khiter_t)

    # Declared with the same field layout as kh_pymap_t.
    ctypedef struct kh_pyset_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        PyObject **keys
        size_t *vals

    kh_pyset_t* kh_init_pyset()
    void kh_destroy_pyset(kh_pyset_t*)
    void kh_clear_pyset(kh_pyset_t*)
    khuint_t kh_get_pyset(kh_pyset_t*, PyObject*)
    void kh_resize_pyset(kh_pyset_t*, khuint_t)
    khuint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
    void kh_del_pyset(kh_pyset_t*, khuint_t)

    bint kh_exist_pyset(kh_pyset_t*, khiter_t)

    ctypedef char* kh_cstr_t

    # Table with C-string keys and size_t values; all functions are nogil.
    ctypedef struct kh_str_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        kh_cstr_t *keys
        size_t *vals

    kh_str_t* kh_init_str() nogil
    void kh_destroy_str(kh_str_t*) nogil
    void kh_clear_str(kh_str_t*) nogil
    khuint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
    void kh_resize_str(kh_str_t*, khuint_t) nogil
    khuint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
    void kh_del_str(kh_str_t*, khuint_t) nogil

    bint kh_exist_str(kh_str_t*, khiter_t) nogil

    # Wraps a kh_str_t together with a 256-entry int array.
    # NOTE(review): ``starts`` is presumably indexed by the first byte of a
    # key -- confirm in khash_python.h.
    ctypedef struct kh_str_starts_t:
        kh_str_t *table
        int starts[256]

    kh_str_starts_t* kh_init_str_starts() nogil
    khuint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key,
                                    int* ret) nogil
    khuint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil
    void kh_destroy_str_starts(kh_str_starts_t*) nogil
    void kh_resize_str_starts(kh_str_starts_t*, khuint_t) nogil

    # sweep factorize

    # Table with C-string keys and PyObject* values.
    ctypedef struct kh_strbox_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        kh_cstr_t *keys
        PyObject **vals

    kh_strbox_t* kh_init_strbox() nogil
    void kh_destroy_strbox(kh_strbox_t*) nogil
    void kh_clear_strbox(kh_strbox_t*) nogil
    khuint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil
    void kh_resize_strbox(kh_strbox_t*, khuint_t) nogil
    khuint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil
    void kh_del_strbox(kh_strbox_t*, khuint_t) nogil

    bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil

    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
|
||||
|
||||
|
||||
include "khash_for_primitive_helper.pxi"
|
||||
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
Template for wrapping khash-tables for each primitive `dtype`
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
{{py:

# Each (name, c_type) pair below produces one ``kh_<name>_t`` declaration
# block from the loop that follows.
# name, c_type
primitive_types = [('int64', 'int64_t'),
                   ('uint64', 'uint64_t'),
                   ('float64', 'float64_t'),
                   ('int32', 'int32_t'),
                   ('uint32', 'uint32_t'),
                   ('float32', 'float32_t'),
                   ('int16', 'int16_t'),
                   ('uint16', 'uint16_t'),
                   ('int8', 'int8_t'),
                   ('uint8', 'uint8_t'),
                   ('complex64', 'khcomplex64_t'),
                   ('complex128', 'khcomplex128_t'),
                   ]
}}

{{for name, c_type in primitive_types}}

cdef extern from "khash_python.h":
    # table keyed by {{c_type}} with size_t values; all functions are nogil
    ctypedef struct kh_{{name}}_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        {{c_type}} *keys
        size_t *vals

    kh_{{name}}_t* kh_init_{{name}}() nogil
    void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
    void kh_clear_{{name}}(kh_{{name}}_t*) nogil
    khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
    void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil
    khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
    void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil

    bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil

{{endfor}}
|
||||
Binary file not shown.
@@ -0,0 +1,6 @@
|
||||
from numpy cimport ndarray
|
||||
|
||||
|
||||
# C-level declarations shared with other Cython modules.

# ``except -1``: a return value of -1 signals that an exception was raised.
cdef bint c_is_list_like(object, bint) except -1

# Takes an object-dtype array and a key; returns an ndarray.
cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)
|
||||
@@ -0,0 +1,231 @@
|
||||
# TODO(npdtypes): Many types specified here can be made more specific/accurate;
|
||||
# the more specific versions are specified in comments
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Generator,
|
||||
Hashable,
|
||||
Literal,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
DtypeObj,
|
||||
npt,
|
||||
)
|
||||
|
||||
# placeholder until we can specify np.ndarray[object, ndim=2]
|
||||
ndarray_obj_2d = np.ndarray
|
||||
|
||||
from enum import Enum
|
||||
|
||||
# Enum type of the ``no_default`` sentinel declared just below.
class NoDefault(Enum): ...

no_default: NoDefault

# NOTE(review): presumably the int64 / uint64 maxima -- confirm in the
# implementation.
i8max: int
u8max: int

# Scalar helpers and predicates.
def item_from_zerodim(val: object) -> object: ...
def infer_dtype(value: object, skipna: bool = ...) -> str: ...
def is_iterator(obj: object) -> bool: ...
def is_scalar(val: object) -> bool: ...
def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
def is_period(val: object) -> bool: ...
def is_interval(val: object) -> bool: ...
def is_decimal(val: object) -> bool: ...
def is_complex(val: object) -> bool: ...
def is_bool(val: object) -> bool: ...
def is_integer(val: object) -> bool: ...
def is_float(val: object) -> bool: ...

# Array predicates.
def is_interval_array(values: np.ndarray) -> bool: ...
def is_datetime64_array(values: np.ndarray) -> bool: ...
def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...

# NOTE(review): the stubs below carry no return annotation, unlike the
# predicates above.
def is_time_array(values: np.ndarray, skipna: bool = ...): ...
def is_date_array(values: np.ndarray, skipna: bool = ...): ...
def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
def is_string_array(values: np.ndarray, skipna: bool = ...): ...
def is_float_array(values: np.ndarray, skipna: bool = ...): ...
def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
def is_bool_array(values: np.ndarray, skipna: bool = ...): ...

# Lookup / de-duplication / mapping helpers.
def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ...
def map_infer(
    arr: np.ndarray,
    f: Callable[[Any], Any],
    convert: bool = ...,
    ignore_na: bool = ...,
) -> np.ndarray: ...
|
||||
# Overloads of ``maybe_convert_objects``.  The first pins convert_datetime,
# convert_period, convert_interval and convert_to_nullable_integer to
# Literal[False] and returns np.ndarray; every other overload returns
# ArrayLike.
@overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray
def maybe_convert_objects(
    objects: npt.NDArray[np.object_],
    *,
    try_float: bool = ...,
    safe: bool = ...,
    convert_datetime: Literal[False] = ...,
    convert_timedelta: bool = ...,
    convert_period: Literal[False] = ...,
    convert_interval: Literal[False] = ...,
    convert_to_nullable_integer: Literal[False] = ...,
    dtype_if_all_nat: DtypeObj | None = ...,
) -> np.ndarray: ...
# convert_to_nullable_integer is Literal[True]
@overload
def maybe_convert_objects(
    objects: npt.NDArray[np.object_],
    *,
    try_float: bool = ...,
    safe: bool = ...,
    convert_datetime: bool = ...,
    convert_timedelta: bool = ...,
    convert_period: bool = ...,
    convert_interval: bool = ...,
    convert_to_nullable_integer: Literal[True] = ...,
    dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
# convert_datetime is Literal[True]
@overload
def maybe_convert_objects(
    objects: npt.NDArray[np.object_],
    *,
    try_float: bool = ...,
    safe: bool = ...,
    convert_datetime: Literal[True] = ...,
    convert_timedelta: bool = ...,
    convert_period: bool = ...,
    convert_interval: bool = ...,
    convert_to_nullable_integer: bool = ...,
    dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
# convert_period is Literal[True]
@overload
def maybe_convert_objects(
    objects: npt.NDArray[np.object_],
    *,
    try_float: bool = ...,
    safe: bool = ...,
    convert_datetime: bool = ...,
    convert_timedelta: bool = ...,
    convert_period: Literal[True] = ...,
    convert_interval: bool = ...,
    convert_to_nullable_integer: bool = ...,
    dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
# fallback: every flag is a plain bool
@overload
def maybe_convert_objects(
    objects: npt.NDArray[np.object_],
    *,
    try_float: bool = ...,
    safe: bool = ...,
    convert_datetime: bool = ...,
    convert_timedelta: bool = ...,
    convert_period: bool = ...,
    convert_interval: bool = ...,
    convert_to_nullable_integer: bool = ...,
    dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
# Overloads of ``maybe_convert_numeric``: the second tuple element is None
# when convert_to_masked_nullable is Literal[False] and an ndarray when it is
# Literal[True].
@overload
def maybe_convert_numeric(
    values: npt.NDArray[np.object_],
    na_values: set,
    convert_empty: bool = ...,
    coerce_numeric: bool = ...,
    convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...
# here convert_to_masked_nullable is keyword-only and has no default
@overload
def maybe_convert_numeric(
    values: npt.NDArray[np.object_],
    na_values: set,
    convert_empty: bool = ...,
    coerce_numeric: bool = ...,
    *,
    convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...
|
||||
|
||||
# TODO: restrict `arr`?
def ensure_string_array(
    arr,
    na_value: object = ...,
    convert_na_value: bool = ...,
    copy: bool = ...,
    skipna: bool = ...,
) -> npt.NDArray[np.object_]: ...
def infer_datetimelike_array(
    arr: npt.NDArray[np.object_],
) -> tuple[str, bool]: ...
def astype_intsafe(
    arr: npt.NDArray[np.object_],
    new_dtype: np.dtype,
) -> np.ndarray: ...
def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...

# The next four are annotated as returning ndarray_obj_2d.
# TODO: can we be more specific about rows?
def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ...
def tuples_to_object_array(
    tuples: npt.NDArray[np.object_],
) -> ndarray_obj_2d: ...

# TODO: can we be more specific about rows?
def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ...
def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ...

# Each returns either a slice or an array of the same dtype as its input.
def maybe_booleans_to_slice(
    mask: npt.NDArray[np.uint8],
) -> slice | npt.NDArray[np.uint8]: ...
def maybe_indices_to_slice(
    indices: npt.NDArray[np.intp],
    max_len: int,
) -> slice | npt.NDArray[np.intp]: ...
def is_all_arraylike(obj: list) -> bool: ...

# -----------------------------------------------------------------
# Functions which in reality take memoryviews
# (the trailing comments give the memoryview type behind each np.ndarray)

def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64
def map_infer_mask(
    arr: np.ndarray,
    f: Callable[[Any], Any],
    mask: np.ndarray, # const uint8_t[:]
    convert: bool = ...,
    na_value: Any = ...,
    dtype: np.dtype = ...,
) -> np.ndarray: ...
def indices_fast(
    index: npt.NDArray[np.intp],
    labels: np.ndarray, # const int64_t[:]
    keys: list,
    sorted_labels: list[npt.NDArray[np.int64]],
) -> dict[Hashable, npt.NDArray[np.intp]]: ...
def generate_slices(
    labels: np.ndarray, ngroups: int # const intp_t[:] (refers to ``labels``)
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
def count_level_2d(
    mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
    labels: np.ndarray, # const intp_t[:]
    max_bin: int,
    axis: int,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2]
def get_level_sorter(
    label: np.ndarray, # const int64_t[:]
    starts: np.ndarray, # const intp_t[:]
) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1]
def generate_bins_dt64(
    values: npt.NDArray[np.int64],
    binner: np.ndarray, # const int64_t[:]
    closed: object = ...,
    hasnans: bool = ...,
) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1]
def array_equivalent_object(
    left: np.ndarray, # object[:]
    right: np.ndarray, # object[:]
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
def get_reverse_indexer(
    indexer: np.ndarray, # const intp_t[:]
    length: int,
) -> npt.NDArray[np.intp]: ...
def is_bool_list(obj: list) -> bool: ...
def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
|
||||
3093
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/lib.pyx
Normal file
3093
dashboard/flask-server/venv/Lib/site-packages/pandas/_libs/lib.pyx
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,19 @@
|
||||
from numpy cimport (
|
||||
ndarray,
|
||||
uint8_t,
|
||||
)
|
||||
|
||||
|
||||
# C-level declarations of the NA helpers.  ``=*`` marks an argument that has
# a default value; the value itself is given with the implementation.
cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*)

cpdef bint checknull(object val, bint inf_as_na=*)
cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*)

# cdef-only helpers: callable from Cython code, not from Python
cdef bint is_null_datetime64(v)
cdef bint is_null_timedelta64(v)
cdef bint checknull_with_nat_and_na(object obj)

# Extension type declared with no C-level fields.
cdef class C_NAType:
    pass

# Module-level instance of C_NAType.
cdef C_NAType C_NA
|
||||
@@ -0,0 +1,17 @@
|
||||
import numpy as np
|
||||
from numpy import typing as npt
|
||||
|
||||
# Type of the ``NA`` object declared just below.
class NAType: ...

NA: NAType

# Scalar checks: each returns a plain bool.
def is_matching_na(
    left: object, right: object, nan_matches_none: bool = ...
) -> bool: ...
def isposinf_scalar(val: object) -> bool: ...
def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object, inf_as_na: bool = ...) -> bool: ...

# Array checks: each returns a boolean ndarray.
def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
def isnaobj2d(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
|
||||
@@ -0,0 +1,507 @@
|
||||
from decimal import Decimal
|
||||
import numbers
|
||||
from sys import maxsize
|
||||
|
||||
import cython
|
||||
from cython import Py_ssize_t
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
float64_t,
|
||||
int64_t,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
)
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
from pandas._libs cimport util
|
||||
from pandas._libs.tslibs.nattype cimport (
|
||||
c_NaT as NaT,
|
||||
checknull_with_nat,
|
||||
is_dt64nat,
|
||||
is_td64nat,
|
||||
)
|
||||
from pandas._libs.tslibs.np_datetime cimport (
|
||||
get_datetime64_unit,
|
||||
get_datetime64_value,
|
||||
get_timedelta64_value,
|
||||
)
|
||||
|
||||
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
|
||||
|
||||
# Module-level C constants used by the NA checks in this module.
cdef:
    # positive / negative infinity as C doubles
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF

    # integer value that marks NaT in datetime64 / timedelta64 data
    int64_t NPY_NAT = util.get_nat()

    # true when sys.maxsize is at most 2 ** 32
    bint is_32bit = maxsize <= 2 ** 32

    type cDecimal = Decimal  # for faster isinstance checks
|
||||
|
||||
|
||||
cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
    """
    Tell whether ``left`` and ``right`` are NA values of the same kind.

    The kind of ``left`` decides which check is applied to ``right``:
    None, NA, NaT, float NaN, complex NaN, datetime64 NaT,
    timedelta64 NaT (for both of those the units must agree as well)
    or Decimal NaN.

    Parameters
    ----------
    left : Any
    right : Any
    nan_matches_none : bool, default False
        For backwards compatibility, consider NaN as matching None.

    Returns
    -------
    bool
    """
    if left is None:
        if right is None:
            return True
        return nan_matches_none and util.is_nan(right)

    if left is C_NA:
        return right is C_NA

    if left is NaT:
        return right is NaT

    if util.is_float_object(left):
        if not util.is_nan(left):
            return False
        if right is None:
            # a NaN only matches None when explicitly allowed
            return nan_matches_none
        return util.is_float_object(right) and util.is_nan(right)

    if util.is_complex_object(left):
        if not util.is_nan(left):
            return False
        return util.is_complex_object(right) and util.is_nan(right)

    if util.is_datetime64_object(left):
        # check the type of ``right`` before reading its value
        if not util.is_datetime64_object(right):
            return False
        return (
            get_datetime64_value(left) == NPY_NAT
            and get_datetime64_value(right) == NPY_NAT
            and get_datetime64_unit(left) == get_datetime64_unit(right)
        )

    if util.is_timedelta64_object(left):
        # check the type of ``right`` before reading its value
        if not util.is_timedelta64_object(right):
            return False
        return (
            get_timedelta64_value(left) == NPY_NAT
            and get_timedelta64_value(right) == NPY_NAT
            and get_datetime64_unit(left) == get_datetime64_unit(right)
        )

    return is_decimal_na(left) and is_decimal_na(right)
|
||||
|
||||
|
||||
cpdef bint checknull(object val, bint inf_as_na=False):
    """
    Return whether the input is NA-like, defined here as any of:
     - None
     - nan
     - NaT
     - np.datetime64 representation of NaT
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")

    Parameters
    ----------
    val : object
    inf_as_na : bool, default False
        Whether to treat INF and -INF as NA values.

    Returns
    -------
    bool
    """
    if val is None or val is NaT or val is C_NA:
        return True

    if util.is_float_object(val) or util.is_complex_object(val):
        if val != val:
            # only NaN compares unequal to itself
            return True
        if not inf_as_na:
            return False
        return val == INF or val == NEGINF

    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT

    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT

    return is_decimal_na(val)
|
||||
|
||||
|
||||
cdef inline bint is_decimal_na(object val):
    """
    Return whether ``val`` is a decimal.Decimal that is not equal to itself,
    i.e. Decimal("NaN").
    """
    if not isinstance(val, cDecimal):
        return False
    return val != val
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False):
    """
    Build a boolean mask flagging the na-like elements of a 1-D array.

    Each element is classified with `checknull`, i.e. the following are
    considered na-like:
     - None
     - nan
     - NaT
     - np.datetime64 representation of NaT
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")

    Parameters
    ----------
    arr : ndarray
        Must be 1-dimensional.
    inf_as_na : bool, default False
        Forwarded to `checknull` for every element.

    Returns
    -------
    result : ndarray (dtype=np.bool_)
    """
    cdef:
        Py_ssize_t idx, length
        ndarray[uint8_t] flags

    assert arr.ndim == 1, "'arr' must be 1-D."

    length = len(arr)
    # Filled as uint8 and reinterpreted as bool on return.
    flags = np.empty(length, dtype=np.uint8)
    for idx in range(length):
        flags[idx] = checknull(arr[idx], inf_as_na=inf_as_na)
    return flags.view(np.bool_)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d(arr: ndarray, inf_as_na: bool = False) -> ndarray:
    """
    Build a boolean mask flagging the na-like elements of a 2-D array.

    Each element is classified with `checknull`, i.e. the following are
    considered na-like:
     - None
     - nan
     - NaT
     - np.datetime64 representation of NaT
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")

    Parameters
    ----------
    arr : ndarray
        Must be 2-dimensional.
    inf_as_na : bool, default False
        Forwarded to `checknull` for every element.

    Returns
    -------
    result : ndarray (dtype=np.bool_)
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        ndarray[uint8_t, ndim=2] flags

    assert arr.ndim == 2, "'arr' must be 2-D."

    nrows, ncols = (<object>arr).shape
    flags = np.zeros((nrows, ncols), dtype=np.uint8)
    for row in range(nrows):
        for col in range(ncols):
            flags[row, col] = checknull(arr[row, col], inf_as_na=inf_as_na)
    return flags.view(np.bool_)
|
||||
|
||||
|
||||
def isposinf_scalar(val: object) -> bool:
    # Only float objects can match; everything else is rejected up front.
    if not util.is_float_object(val):
        return False
    return val == INF
|
||||
|
||||
|
||||
def isneginf_scalar(val: object) -> bool:
    # Only float objects can match; everything else is rejected up front.
    if not util.is_float_object(val):
        return False
    return val == NEGINF
|
||||
|
||||
|
||||
cdef inline bint is_null_datetime64(v):
    # Null check for datetime data (or integer versions): accepts whatever
    # checknull_with_nat accepts plus datetime64 NaT, so that
    # np.timedelta64('nat') is excluded.
    return checknull_with_nat(v) or is_dt64nat(v)
|
||||
|
||||
|
||||
cdef inline bint is_null_timedelta64(v):
    # Null check for timedelta data (or integer versions): accepts whatever
    # checknull_with_nat accepts plus timedelta64 NaT, so that
    # np.datetime64('nat') is excluded.
    return checknull_with_nat(v) or is_td64nat(v)
|
||||
|
||||
|
||||
cdef bint checknull_with_nat_and_na(object obj):
    # See GH#32214
    # Same as checknull_with_nat, but additionally accepts the NA singleton.
    if obj is C_NA:
        return True
    return checknull_with_nat(obj)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def is_float_nan(values: ndarray) -> ndarray:
    """
    Flag the elements of ``values`` for which ``util.is_nan`` is true,
    i.e. the elements that correspond to a float nan.

    Returns
    -------
    ndarray[bool]
    """
    cdef:
        ndarray[uint8_t] flags
        Py_ssize_t pos, size

    size = len(values)
    flags = np.zeros(size, dtype=np.uint8)

    for pos in range(size):
        flags[pos] = util.is_nan(values[pos])
    return flags.view(bool)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
    """
    Check for NA values consistent with IntegerArray/FloatingArray.

    Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.

    Returns
    -------
    ndarray[bool]

    Raises
    ------
    TypeError
        If an element is NA-like according to `checknull` but is not one of
        None, NA, nan or Decimal("NaN").
    """
    cdef:
        ndarray[uint8_t] flags
        Py_ssize_t pos, size
        object val

    size = len(values)
    flags = np.zeros(size, dtype=np.uint8)

    for pos in range(size):
        val = values[pos]
        if not checknull(val):
            continue

        # NA-like, but only the numeric-compatible markers are accepted.
        if not (
            val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val)
        ):
            raise TypeError(f"'values' contains non-numeric NA {val}")
        flags[pos] = True
    return flags.view(bool)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Implementation of NA singleton
|
||||
|
||||
|
||||
def _create_binary_propagating_op(name, is_divmod=False):
    """
    Build a binary dunder method that propagates NA.

    The generated method returns NA for NA, str, number and np.bool_
    operands and for shapeless ndarrays, an object ndarray filled with NA
    for other ndarrays, and NotImplemented otherwise. When ``is_divmod`` is
    true each result is returned as a pair.
    """

    def method(self, other):
        # The shapeless-ndarray clause routes 0-dim arrays to the scalar
        # path, because the setitem on `out` below won't work for them.
        treat_as_scalar = (
            other is C_NA
            or isinstance(other, (str, numbers.Number, np.bool_))
            or (isinstance(other, np.ndarray) and not other.shape)
        )
        if treat_as_scalar:
            return (NA, NA) if is_divmod else NA

        if isinstance(other, np.ndarray):
            out = np.empty(other.shape, dtype=object)
            out[:] = NA
            # divmod needs two independent arrays
            return (out, out.copy()) if is_divmod else out

        return NotImplemented

    method.__name__ = name
    return method
|
||||
|
||||
|
||||
def _create_unary_propagating_op(name: str):
    # Factory for unary dunder methods: the generated method ignores its
    # operand and always returns NA; it is renamed to `name` before being
    # handed back.
    def method(self):
        return NA

    method.__name__ = name
    return method
|
||||
|
||||
|
||||
# Empty extension type used as the base class of NAType below.
cdef class C_NAType:
    pass
|
||||
|
||||
|
||||
class NAType(C_NAType):
    """
    NA ("not available") missing value indicator.

    .. warning::

       Experimental: the behaviour of NA can still change without warning.

    .. versionadded:: 1.0.0

    The NA singleton is a missing value indicator defined by pandas. It is
    used in certain new extension dtypes (currently the "string" dtype).
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton: create the instance on first use, then keep returning it.
        instance = NAType._instance
        if instance is None:
            instance = C_NAType.__new__(cls, *args, **kwargs)
            NAType._instance = instance
        return instance

    def __repr__(self) -> str:
        return "<NA>"

    def __format__(self, format_spec) -> str:
        try:
            return self.__repr__().__format__(format_spec)
        except ValueError:
            # spec is not valid for a string: fall back to the plain repr
            return self.__repr__()

    def __bool__(self):
        raise TypeError("boolean value of NA is ambiguous")

    def __hash__(self):
        # GH 30013: Ensure hash is large enough to avoid hash collisions with integers
        exponent = 31 if is_32bit else 61
        return 2 ** exponent - 1

    def __reduce__(self):
        return "NA"

    # Binary arithmetic and comparison ops -> propagate

    __add__ = _create_binary_propagating_op("__add__")
    __radd__ = _create_binary_propagating_op("__radd__")
    __sub__ = _create_binary_propagating_op("__sub__")
    __rsub__ = _create_binary_propagating_op("__rsub__")
    __mul__ = _create_binary_propagating_op("__mul__")
    __rmul__ = _create_binary_propagating_op("__rmul__")
    __matmul__ = _create_binary_propagating_op("__matmul__")
    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
    __truediv__ = _create_binary_propagating_op("__truediv__")
    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
    __floordiv__ = _create_binary_propagating_op("__floordiv__")
    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
    __mod__ = _create_binary_propagating_op("__mod__")
    __rmod__ = _create_binary_propagating_op("__rmod__")
    __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True)
    __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True)
    # __lshift__ and __rshift__ are not implemented

    __eq__ = _create_binary_propagating_op("__eq__")
    __ne__ = _create_binary_propagating_op("__ne__")
    __le__ = _create_binary_propagating_op("__le__")
    __lt__ = _create_binary_propagating_op("__lt__")
    __gt__ = _create_binary_propagating_op("__gt__")
    __ge__ = _create_binary_propagating_op("__ge__")

    # Unary ops

    __neg__ = _create_unary_propagating_op("__neg__")
    __pos__ = _create_unary_propagating_op("__pos__")
    __abs__ = _create_unary_propagating_op("__abs__")
    __invert__ = _create_unary_propagating_op("__invert__")

    # pow is special-cased: some exponents/bases give a known result

    def __pow__(self, other):
        if other is C_NA:
            return NA
        if isinstance(other, (numbers.Number, np.bool_)):
            # NA ** 0 is 1; returning positive is correct for +/- 0.
            return type(other)(1) if other == 0 else NA
        if isinstance(other, np.ndarray):
            return np.where(other == 0, other.dtype.type(1), NA)

        return NotImplemented

    def __rpow__(self, other):
        if other is C_NA:
            return NA
        if isinstance(other, (numbers.Number, np.bool_)):
            # 1 ** NA is 1; any other base stays unknown
            return other if other == 1 else NA
        if isinstance(other, np.ndarray):
            return np.where(other == 1, other, NA)
        return NotImplemented

    # Logical ops using Kleene logic

    def __and__(self, other):
        if other is False:
            return False
        if other is True or other is C_NA:
            return NA
        return NotImplemented

    __rand__ = __and__

    def __or__(self, other):
        if other is True:
            return True
        if other is False or other is C_NA:
            return NA
        return NotImplemented

    __ror__ = __or__

    def __xor__(self, other):
        if other is False or other is True or other is C_NA:
            return NA
        return NotImplemented

    __rxor__ = __xor__

    __array_priority__ = 1000
    _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        handled = self._HANDLED_TYPES + (NAType,)
        for operand in inputs:
            if not isinstance(operand, handled):
                return NotImplemented

        if method != "__call__":
            raise ValueError(f"ufunc method '{method}' not supported for NA")

        result = maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # For a NumPy ufunc that's not a binop, like np.logaddexp:
        # broadcast the inputs and hand back the one that was NA.
        index = [i for i, x in enumerate(inputs) if x is NA][0]
        result = np.broadcast_arrays(*inputs)[index]
        if result.ndim == 0:
            result = result.item()
        if ufunc.nout > 1:
            result = (NA,) * ufunc.nout
        return result
|
||||
|
||||
|
||||
# Both names are bound to the same NAType() instance.
C_NA = NAType()   # C-visible
NA = C_NA         # Python-visible
|
||||
Binary file not shown.
@@ -0,0 +1,50 @@
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Iterable,
|
||||
Literal,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
# Callable shape of the generic binary operators accepted by the *_binop stubs.
_BinOp = Callable[[Any, Any], Any]
# Callable shape of the comparison operators accepted by the *_compare stubs.
_BoolOp = Callable[[Any, Any], bool]

def scalar_compare(
    values: np.ndarray,  # object[:]
    val: object,
    op: _BoolOp,  # {operator.eq, operator.ne, ...}
) -> npt.NDArray[np.bool_]: ...
def vec_compare(
    left: npt.NDArray[np.object_],
    right: npt.NDArray[np.object_],
    op: _BoolOp,  # {operator.eq, operator.ne, ...}
) -> npt.NDArray[np.bool_]: ...
def scalar_binop(
    values: np.ndarray,  # object[:]
    val: object,
    op: _BinOp,  # binary operator
) -> np.ndarray: ...
def vec_binop(
    left: np.ndarray,  # object[:]
    right: np.ndarray,  # object[:]
    op: _BinOp,  # binary operator
) -> np.ndarray: ...
# Overload 1: convert_to_masked_nullable=False (the default) -> no mask returned.
@overload
def maybe_convert_bool(
    arr: npt.NDArray[np.object_],
    true_values: Iterable = ...,
    false_values: Iterable = ...,
    convert_to_masked_nullable: Literal[False] = ...,
) -> tuple[np.ndarray, None]: ...
# Overload 2: convert_to_masked_nullable=True (keyword-only) -> second element
# is typed as an ndarray.
@overload
def maybe_convert_bool(
    arr: npt.NDArray[np.object_],
    true_values: Iterable = ...,
    false_values: Iterable = ...,
    *,
    convert_to_masked_nullable: Literal[True],
) -> tuple[np.ndarray, np.ndarray]: ...
|
||||
@@ -0,0 +1,310 @@
|
||||
import operator
|
||||
|
||||
from cpython.object cimport (
|
||||
Py_EQ,
|
||||
Py_GE,
|
||||
Py_GT,
|
||||
Py_LE,
|
||||
Py_LT,
|
||||
Py_NE,
|
||||
PyObject_RichCompareBool,
|
||||
)
|
||||
|
||||
import cython
|
||||
from cython import Py_ssize_t
|
||||
import numpy as np
|
||||
|
||||
from numpy cimport (
|
||||
import_array,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
)
|
||||
|
||||
import_array()
|
||||
|
||||
|
||||
from pandas._libs.missing cimport checknull
|
||||
from pandas._libs.util cimport is_nan
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_compare(object[:] values, object val, object op) -> ndarray:
    """
    Compare every element of `values` against the scalar `val` using the
    comparison described by `op`.

    NA-like operands (per `checknull`) never go through the comparison:
    the result is True for ``operator.ne`` and False for every other
    operator. For ``eq``/``ne`` a TypeError raised by the comparison is
    treated the same way; for the ordering operators it propagates.

    Parameters
    ----------
    values : ndarray[object]
    val : object
    op : {operator.eq, operator.ne,
          operator.le, operator.lt,
          operator.ge, operator.gt}

    Returns
    -------
    result : ndarray[bool]

    Raises
    ------
    ValueError
        If `op` is not one of the six recognized operators.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        ndarray[uint8_t, cast=True] result
        bint isnull_val, na_result
        int flag
        object x

    if op is operator.lt:
        flag = Py_LT
    elif op is operator.le:
        flag = Py_LE
    elif op is operator.gt:
        flag = Py_GT
    elif op is operator.ge:
        flag = Py_GE
    elif op is operator.eq:
        flag = Py_EQ
    elif op is operator.ne:
        flag = Py_NE
    else:
        raise ValueError('Unrecognized operator')

    result = np.empty(n, dtype=bool).view(np.uint8)
    isnull_val = checknull(val)

    # Outcome used whenever one side is NA: True only for "!=".
    na_result = flag == Py_NE

    if flag == Py_NE or flag == Py_EQ:
        for i in range(n):
            x = values[i]
            if isnull_val or checknull(x):
                result[i] = na_result
            else:
                try:
                    result[i] = PyObject_RichCompareBool(x, val, flag)
                except TypeError:
                    # incomparable objects are simply "not equal"
                    result[i] = na_result
    else:
        for i in range(n):
            x = values[i]
            if isnull_val or checknull(x):
                result[i] = na_result
            else:
                result[i] = PyObject_RichCompareBool(x, val, flag)

    return result.view(bool)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_compare(ndarray[object] left, ndarray[object] right, object op) -> ndarray:
    """
    Compare `left` and `right` element by element using the comparison
    described by `op`.

    When either element of a pair is NA-like (per `checknull`) the
    comparison is skipped: the result is True for ``operator.ne`` and False
    for every other operator.

    Parameters
    ----------
    left : ndarray[object]
    right : ndarray[object]
    op : {operator.eq, operator.ne,
          operator.le, operator.lt,
          operator.ge, operator.gt}

    Returns
    -------
    result : ndarray[bool]

    Raises
    ------
    ValueError
        If the arrays differ in length or `op` is not recognized.
    """
    cdef:
        Py_ssize_t i, n = len(left)
        ndarray[uint8_t, cast=True] result
        bint na_result
        int flag

    if n != <Py_ssize_t>len(right):
        raise ValueError(f'Arrays were different lengths: {n} vs {len(right)}')

    if op is operator.lt:
        flag = Py_LT
    elif op is operator.le:
        flag = Py_LE
    elif op is operator.gt:
        flag = Py_GT
    elif op is operator.ge:
        flag = Py_GE
    elif op is operator.eq:
        flag = Py_EQ
    elif op is operator.ne:
        flag = Py_NE
    else:
        raise ValueError('Unrecognized operator')

    result = np.empty(n, dtype=bool).view(np.uint8)

    # Outcome used whenever one side is NA: True only for "!=".
    na_result = flag == Py_NE

    for i in range(n):
        x = left[i]
        y = right[i]

        if checknull(x) or checknull(y):
            result[i] = na_result
        else:
            result[i] = PyObject_RichCompareBool(x, y, flag)

    return result.view(bool)
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def scalar_binop(object[:] values, object val, object op) -> ndarray:
    """
    Apply the binary operator `op` between each element of `values` and the
    scalar `val`.

    If `val` is None or nan, the output is filled with `val` and `op` is
    never called. Elements of `values` that are None or nan are copied
    through unchanged. Otherwise the result is passed through
    `maybe_convert_bool` before being returned.

    Parameters
    ----------
    values : ndarray[object]
    val : object
    op : binary operator

    Returns
    -------
    result : ndarray[object]
    """
    cdef:
        Py_ssize_t pos, size = len(values)
        object[:] out
        object elem

    out = np.empty(size, dtype=object)

    if val is None or is_nan(val):
        out[:] = val
        return out.base  # `.base` to access underlying np.ndarray

    for pos in range(size):
        elem = values[pos]
        if elem is None or is_nan(elem):
            # missing input propagates as-is
            out[pos] = elem
        else:
            out[pos] = op(elem, val)

    converted = maybe_convert_bool(out.base)
    return converted[0]
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
    """
    Apply the binary operator `op` pointwise to the elements of `left` and
    `right`.

    If `op` raises TypeError for a pair, the left element is used as the
    result when it is None or nan, otherwise the right element when it is
    None or nan; if neither is, the TypeError is re-raised. The result is
    passed through `maybe_convert_bool` before being returned.

    Parameters
    ----------
    left : ndarray[object]
    right : ndarray[object]
    op : binary operator

    Returns
    -------
    result : ndarray[object]

    Raises
    ------
    ValueError
        If the arrays differ in length.
    """
    cdef:
        Py_ssize_t pos, size = len(left)
        object[:] out

    if size != <Py_ssize_t>len(right):
        raise ValueError(f'Arrays were different lengths: {size} vs {len(right)}')

    out = np.empty(size, dtype=object)

    for pos in range(size):
        lhs = left[pos]
        rhs = right[pos]
        try:
            out[pos] = op(lhs, rhs)
        except TypeError:
            # Only swallow the error when one operand is a missing value.
            if lhs is None or is_nan(lhs):
                out[pos] = lhs
            elif rhs is None or is_nan(rhs):
                out[pos] = rhs
            else:
                raise

    converted = maybe_convert_bool(out.base)  # `.base` to access np.ndarray
    return converted[0]
|
||||
|
||||
|
||||
def maybe_convert_bool(ndarray[object] arr,
                       true_values=None,
                       false_values=None,
                       convert_to_masked_nullable=False
                       ) -> tuple[np.ndarray, np.ndarray | None]:
    """
    Try to convert an object array of bool-like values to a boolean array.

    Python bools, members of the true/false value sets and nan are
    understood; as soon as any other value is met, the input array is
    returned unchanged.

    Parameters
    ----------
    arr : ndarray[object]
    true_values : iterable, optional
        Extra values to treat as True, in addition to
        'True', 'TRUE' and 'true'.
    false_values : iterable, optional
        Extra values to treat as False, in addition to
        'False', 'FALSE' and 'false'.
    convert_to_masked_nullable : bool, default False
        When nan is present, return a (values, mask) pair of bool arrays
        instead of an object array holding nan.

    Returns
    -------
    tuple[np.ndarray, np.ndarray | None]
        The (possibly converted) values and, only for the masked case, the
        boolean mask of nan positions.
    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint8_t] result
        ndarray[uint8_t] mask
        object val
        set true_vals, false_vals
        bint has_na = False

    n = len(arr)
    result = np.empty(n, dtype=np.uint8)
    mask = np.zeros(n, dtype=np.uint8)

    # the defaults, extended by any user-supplied values
    true_vals = {'True', 'TRUE', 'true'}
    false_vals = {'False', 'FALSE', 'false'}
    if true_values is not None:
        true_vals = true_vals | set(true_values)
    if false_values is not None:
        false_vals = false_vals | set(false_values)

    for i in range(n):
        val = arr[i]

        if isinstance(val, bool):
            result[i] = 1 if val is True else 0
        elif val in true_vals:
            result[i] = 1
        elif val in false_vals:
            result[i] = 0
        elif is_nan(val):
            mask[i] = 1
            result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
            has_na = True
        else:
            # not bool-like: give the caller back the untouched input
            return (arr, None)

    if not has_na:
        return (result.view(np.bool_), None)

    if convert_to_masked_nullable:
        return (result.view(np.bool_), mask.view(np.bool_))

    # Object array so that the masked slots can hold nan.
    arr = result.view(np.bool_).astype(object)
    np.putmask(arr, mask, np.nan)
    return (arr, None)
|
||||
Binary file not shown.
@@ -0,0 +1,5 @@
|
||||
import numpy as np
|
||||
|
||||
# Type stub: the return type is left unannotated.
def maybe_dispatch_ufunc_to_dunder_op(
    self, ufunc: np.ufunc, method: str, *inputs, **kwargs
): ...
|
||||
@@ -0,0 +1,121 @@
|
||||
# Short operator names (after alias resolution) that
# maybe_dispatch_ufunc_to_dunder_op forwards to a dunder method.
DISPATCHED_UFUNCS = {
    "add",
    "sub",
    "mul",
    "pow",
    "mod",
    "floordiv",
    "truediv",
    "divmod",
    "eq",
    "ne",
    "lt",
    "gt",
    "le",
    "ge",
    "remainder",
    "matmul",
    "or",
    "xor",
    "and",
    "neg",
    "pos",
    "abs",
}
# Subset of the names above whose dunder method takes no operand.
UNARY_UFUNCS = {
    "neg",
    "pos",
    "abs",
}
# Maps a ufunc's __name__ to the short operator name used in the sets above.
# Names without an entry here (e.g. "add") are used unchanged.
UFUNC_ALIASES = {
    "subtract": "sub",
    "multiply": "mul",
    "floor_divide": "floordiv",
    "true_divide": "truediv",
    "power": "pow",
    "remainder": "mod",
    "divide": "truediv",
    "equal": "eq",
    "not_equal": "ne",
    "less": "lt",
    "less_equal": "le",
    "greater": "gt",
    "greater_equal": "ge",
    "bitwise_or": "or",
    "bitwise_and": "and",
    "bitwise_xor": "xor",
    "negative": "neg",
    "absolute": "abs",
    "positive": "pos",
}

# For op(., Array) -> Array.__r{op}__
# Comparisons have no __r*__ form, so the mirrored comparison is used instead.
REVERSED_NAMES = {
    "lt": "__gt__",
    "le": "__ge__",
    "gt": "__lt__",
    "ge": "__le__",
    "eq": "__eq__",
    "ne": "__ne__",
}
|
||||
|
||||
|
||||
def maybe_dispatch_ufunc_to_dunder_op(
    object self, object ufunc, str method, *inputs, **kwargs
):
    """
    Dispatch a ufunc to the equivalent dunder method.

    Parameters
    ----------
    self : ArrayLike
        The array whose dunder method we dispatch to
    ufunc : Callable
        A NumPy ufunc
    method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'}
    inputs : ArrayLike
        The input arrays.
    kwargs : Any
        The additional keyword arguments, e.g. ``out``.

    Returns
    -------
    result : Any
        The result of applying the ufunc, or NotImplemented when the call
        cannot be dispatched (keyword arguments given, more than two ufunc
        inputs, a method other than ``__call__``, an operator that is not
        in DISPATCHED_UFUNCS, or `self` not among the first two inputs).
    """
    # Resolve the NumPy ufunc name to the short operator name.
    ufunc_name = ufunc.__name__
    op_name = UFUNC_ALIASES.get(ufunc_name, ufunc_name)

    def not_implemented(*args, **kwargs):
        return NotImplemented

    if kwargs or ufunc.nin > 2:
        return NotImplemented

    if method != "__call__" or op_name not in DISPATCHED_UFUNCS:
        return NotImplemented

    if inputs[0] is self:
        # self is the left operand: use the forward dunder
        meth = getattr(self, f"__{op_name}__", not_implemented)

        if op_name in UNARY_UFUNCS:
            assert len(inputs) == 1
            return meth()

        return meth(inputs[1])

    if inputs[1] is self:
        # self is the right operand: use the reflected/mirrored dunder
        reflected = REVERSED_NAMES.get(op_name, f"__r{op_name}__")
        meth = getattr(self, reflected, not_implemented)
        return meth(inputs[0])

    # should not be reached, but covering our bases
    return NotImplemented
|
||||
Binary file not shown.
@@ -0,0 +1,71 @@
|
||||
from typing import (
|
||||
Hashable,
|
||||
Literal,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
ArrayLike,
|
||||
Dtype,
|
||||
npt,
|
||||
)
|
||||
|
||||
# Module-level set of strings (declared only; its contents are not visible here).
STR_NA_VALUES: set[str]

def sanitize_objects(
    values: npt.NDArray[np.object_],
    na_values: set,
) -> int: ...

# Type stub for the TextReader class; the trailing comments record the
# underlying C integer types and single-character restrictions.
class TextReader:
    unnamed_cols: set[str]
    table_width: int  # int64_t
    leading_cols: int  # int64_t
    header: list[list[int]]  # non-negative integers
    def __init__(
        self,
        source,
        delimiter: bytes | str = ...,  # single-character only
        header=...,
        header_start: int = ...,  # int64_t
        header_end: int = ...,  # uint64_t
        index_col=...,
        names=...,
        tokenize_chunksize: int = ...,  # int64_t
        delim_whitespace: bool = ...,
        converters=...,
        skipinitialspace: bool = ...,
        escapechar: bytes | str | None = ...,  # single-character only
        doublequote: bool = ...,
        quotechar: str | bytes | None = ...,  # at most 1 character
        quoting: int = ...,
        lineterminator: bytes | str | None = ...,  # at most 1 character
        comment=...,
        decimal: bytes | str = ...,  # single-character only
        thousands: bytes | str | None = ...,  # single-character only
        dtype: Dtype | dict[Hashable, Dtype] = ...,
        usecols=...,
        error_bad_lines: bool = ...,
        warn_bad_lines: bool = ...,
        na_filter: bool = ...,
        na_values=...,
        na_fvalues=...,
        keep_default_na: bool = ...,
        true_values=...,
        false_values=...,
        allow_leading_cols: bool = ...,
        skiprows=...,
        skipfooter: int = ...,  # int64_t
        verbose: bool = ...,
        mangle_dupe_cols: bool = ...,
        float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
        skip_blank_lines: bool = ...,
        encoding_errors: bytes | str = ...,
    ): ...
    def set_error_bad_lines(self, status: int) -> None: ...
    def set_noconvert(self, i: int) -> None: ...
    def remove_noconvert(self, i: int) -> None: ...
    def close(self) -> None: ...
    def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
    def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
# pyright: reportIncompleteStub = false
from typing import Any

# note: this is a lie to make type checkers happy (they special
# case property). cache_readonly uses attribute names similar to
# property (fget) but it does not provide fset and fdel.
cache_readonly = property

# Catch-all so that any other name looked up on this stub module types as Any.
def __getattr__(name: str) -> Any: ...  # incomplete
|
||||
@@ -0,0 +1,70 @@
|
||||
from cython import Py_ssize_t
|
||||
|
||||
from cpython.dict cimport (
|
||||
PyDict_Contains,
|
||||
PyDict_GetItem,
|
||||
PyDict_SetItem,
|
||||
)
|
||||
|
||||
|
||||
cdef class CachedProperty:
    # Descriptor that computes `fget(obj)` once per instance and stores the
    # result in the instance's `_cache` dict under the getter's name.

    cdef readonly:
        object fget, name, __doc__

    def __init__(self, fget):
        self.fget = fget
        self.name = fget.__name__
        self.__doc__ = getattr(fget, '__doc__', None)

    def __get__(self, obj, typ):
        if obj is None:
            # accessed on the class, not the instance
            return self

        # Get the cache or set a default one if needed
        cache = getattr(obj, '_cache', None)
        if cache is None:
            try:
                cache = obj._cache = {}
            except (AttributeError):
                # the instance cannot hold a cache; hand back the descriptor
                return self

        if not PyDict_Contains(cache, self.name):
            # first access: compute and remember the value
            val = self.fget(obj)
            PyDict_SetItem(cache, self.name, val)
            return val

        # not necessary to Py_INCREF
        return <object>PyDict_GetItem(cache, self.name)

    def __set__(self, obj, value):
        raise AttributeError("Can't set attribute")


cache_readonly = CachedProperty
|
||||
|
||||
|
||||
cdef class AxisProperty:
    # Descriptor exposing one entry of `obj._mgr.axes`; assignment is
    # forwarded to `obj._set_axis`.

    cdef readonly:
        Py_ssize_t axis
        object __doc__

    def __init__(self, axis=0, doc=""):
        self.axis = axis
        self.__doc__ = doc

    def __get__(self, obj, type):
        cdef:
            list axes

        if obj is None:
            # Only instances have _mgr, not classes
            return self

        axes = obj._mgr.axes
        return axes[self.axis]

    def __set__(self, obj, value):
        obj._set_axis(self.axis, value)
|
||||
Binary file not shown.
@@ -0,0 +1,33 @@
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
from pandas._libs.util cimport is_array
|
||||
|
||||
|
||||
cdef cnp.dtype _dtype_obj = np.dtype("object")
|
||||
|
||||
|
||||
cpdef check_result_array(object obj, object dtype):
    # Our operation is supposed to be an aggregation/reduction. If
    #  it returns an ndarray, this likely means an invalid operation has
    #  been passed. See test_apply_without_aggregation, test_agg_must_agg
    if not is_array(obj):
        return

    # If it is object dtype, the function can be a reduction/aggregation
    #  and still return an ndarray e.g. test_agg_over_numpy_arrays
    if dtype != _dtype_obj:
        raise ValueError("Must produce aggregated value")
|
||||
|
||||
|
||||
cpdef inline extract_result(object res):
    """
    Unwrap a result object.

    If `res` exposes ``_values`` it is replaced by that attribute, and a
    1-dimensional, length-1 container is further reduced to its only
    element. Anything else is returned unchanged.
    """
    if not hasattr(res, "_values"):
        return res

    # Preserve EA
    res = res._values
    if res.ndim == 1 and len(res) == 1:
        # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply
        res = res[0]
    return res
|
||||
Binary file not shown.
@@ -0,0 +1,16 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
# Type stubs; the trailing comments record the memoryview types the
# arguments are declared with.
def unstack(
    values: np.ndarray,  # reshape_t[:, :]
    mask: np.ndarray,  # const uint8_t[:]
    stride: int,
    length: int,
    width: int,
    new_values: np.ndarray,  # reshape_t[:, :]
    new_mask: np.ndarray,  # uint8_t[:, :]
) -> None: ...
def explode(
    values: npt.NDArray[np.object_],
) -> tuple[npt.NDArray[np.object_], npt.NDArray[np.int64]]: ...
|
||||
@@ -0,0 +1,139 @@
|
||||
import cython
|
||||
from cython import Py_ssize_t
|
||||
|
||||
from numpy cimport (
|
||||
int64_t,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
from pandas._libs.dtypes cimport numeric_object_t
|
||||
from pandas._libs.lib cimport c_is_list_like
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask,
            Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width,
            numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None:
    """
    Transform long values to wide new_values.

    Both output arrays are written in place; nothing is returned.

    Parameters
    ----------
    values : typed ndarray
    mask : np.ndarray[bool]
        Flat mask indexed by ``j * width + w``; false entries are skipped
        and counted as missing.
    stride : int
    length : int
    width : int
    new_values : typed ndarray
        result array
    new_mask : np.ndarray[bool]
        result mask
    """
    cdef:
        Py_ssize_t i, j, w, nulls, s, offset

    if numeric_object_t is not object:
        # evaluated at compile-time
        with nogil:
            for i in range(stride):

                nulls = 0
                for j in range(length):

                    for w in range(width):

                        offset = j * width + w

                        if not mask[offset]:
                            nulls += 1
                            continue

                        # source rows are compacted, so shift back by the
                        # number of missing slots seen so far
                        s = i * width + w
                        new_values[j, s] = values[offset - nulls, i]
                        new_mask[j, s] = 1

    else:
        # object-dtype, identical to above but we cannot use nogil
        for i in range(stride):

            nulls = 0
            for j in range(length):

                for w in range(width):

                    offset = j * width + w

                    if not mask[offset]:
                        nulls += 1
                        continue

                    s = i * width + w
                    new_values[j, s] = values[offset - nulls, i]
                    new_mask[j, s] = 1
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
def explode(ndarray[object] values):
    """
    Flatten an array of list-likes into long form.

    Every non-empty list-like contributes one output entry per element, an
    empty list-like contributes a single nan, and any other entry is kept
    as-is.

    Parameters
    ----------
    values : ndarray[object]

    Returns
    -------
    ndarray[object]
        result
    ndarray[int64_t]
        counts
    """
    cdef:
        Py_ssize_t i, j, count, n
        object v
        ndarray[object] result
        ndarray[int64_t] counts

    # first pass: how many output slots does each input entry need
    n = len(values)
    counts = np.zeros(n, dtype='int64')
    for i in range(n):
        v = values[i]

        if c_is_list_like(v, True) and len(v):
            counts[i] += len(v)
        else:
            # scalar, or empty list-like (which gets a nan marker)
            counts[i] += 1

    # second pass: fill the output
    result = np.empty(counts.sum(), dtype='object')
    count = 0
    for i in range(n):
        v = values[i]

        if not c_is_list_like(v, True):
            # replace with the existing scalar
            result[count] = v
            count += 1
        elif len(v):
            v = list(v)
            for j in range(len(v)):
                result[count] = v[j]
                count += 1
        else:
            # empty list-like, use a nan marker
            result[count] = np.nan
            count += 1
    return result, counts
|
||||
Binary file not shown.
@@ -0,0 +1,47 @@
|
||||
from typing import (
|
||||
Sequence,
|
||||
TypeVar,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
SparseIndexT = TypeVar("SparseIndexT", bound="SparseIndex")
|
||||
|
||||
class SparseIndex:
    # Type stub for the abstract base class of the sparse index types.
    length: int  # total length of the indexed array
    npoints: int  # number of explicitly stored positions
    def __init__(self) -> None: ...
    @property
    def ngaps(self) -> int: ...  # length - npoints
    @property
    def nbytes(self) -> int: ...
    @property
    def indices(self) -> npt.NDArray[np.int32]: ...
    def equals(self, other: object) -> bool: ...
    # lookup / lookup_array return the internal location, or -1 when absent
    def lookup(self, index: int) -> np.int32: ...
    def lookup_array(self, indexer: npt.NDArray[np.int32]) -> npt.NDArray[np.int32]: ...
    def to_int_index(self) -> IntIndex: ...
    def to_block_index(self) -> BlockIndex: ...
    def intersect(self: SparseIndexT, y_: SparseIndex) -> SparseIndexT: ...
    def make_union(self: SparseIndexT, y_: SparseIndex) -> SparseIndexT: ...
|
||||
|
||||
class IntIndex(SparseIndex):
    # Type stub: sparse index that stores every position explicitly.
    indices: npt.NDArray[np.int32]
    def __init__(
        self, length: int, indices: Sequence[int], check_integrity: bool = ...
    ) -> None: ...
|
||||
|
||||
class BlockIndex(SparseIndex):
    # Type stub: sparse index stored as (start, length) blocks.
    nblocks: int  # number of blocks
    blocs: np.ndarray  # block start locations
    blengths: np.ndarray  # block lengths
    def __init__(
        self, length: int, blocs: np.ndarray, blengths: np.ndarray
    ) -> None: ...
|
||||
|
||||
# Boolean mask over an object array: False where an element equals fill_value
# and has the same type, True elsewhere.
def make_mask_object_ndarray(
    arr: npt.NDArray[np.object_], fill_value: object
) -> npt.NDArray[np.bool_]: ...
|
||||
# Convert explicit positions into (block start locations, block lengths).
def get_blocks(
    indices: npt.NDArray[np.int32],
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]: ...
|
||||
@@ -0,0 +1,738 @@
|
||||
import cython
|
||||
import numpy as np
|
||||
|
||||
cimport numpy as cnp
|
||||
from numpy cimport (
|
||||
float32_t,
|
||||
float64_t,
|
||||
int8_t,
|
||||
int16_t,
|
||||
int32_t,
|
||||
int64_t,
|
||||
ndarray,
|
||||
uint8_t,
|
||||
)
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Preamble stuff
|
||||
|
||||
# Module-level float constants used by the sparse arithmetic helpers.
# ``np.nan`` is used rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
cdef float64_t NaN = <float64_t>np.nan
cdef float64_t INF = <float64_t>np.inf
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
cdef class SparseIndex:
    """
    Abstract superclass for sparse index types.

    Not meant to be instantiated directly: ``__init__`` always raises
    NotImplementedError, so subclasses must define their own ``__init__``.
    """

    def __init__(self):
        raise NotImplementedError
|
||||
|
||||
|
||||
cdef class IntIndex(SparseIndex):
    """
    Object for holding exact integer sparse indexing information

    Parameters
    ----------
    length : integer
        Total length of the array being indexed.
    indices : array-like
        Contains integers corresponding to the indices.
    check_integrity : bool, default=True
        Check integrity of the input.
    """

    cdef readonly:
        Py_ssize_t length, npoints
        ndarray indices

    def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
        self.length = length
        # stored as a contiguous int32 array regardless of the input type
        self.indices = np.ascontiguousarray(indices, dtype=np.int32)
        self.npoints = len(self.indices)

        if check_integrity:
            self.check_integrity()

    def __reduce__(self):
        # pickle support: rebuild from (length, indices)
        args = (self.length, self.indices)
        return IntIndex, args

    def __repr__(self) -> str:
        output = 'IntIndex\n'
        output += f'Indices: {repr(self.indices)}\n'
        return output

    @property
    def nbytes(self) -> int:
        return self.indices.nbytes

    cdef check_integrity(self):
        """
        Checks the following:

        - Indices are strictly ascending
        - Number of indices is at most self.length
        - Indices are at least 0 and at most the total length less one

        A ValueError is raised if any of these conditions is violated.
        """

        if self.npoints > self.length:
            raise ValueError(
                f"Too many indices. Expected {self.length} but found {self.npoints}"
            )

        # Indices are vacuously ordered and non-negative
        # if the sequence of indices is empty.
        if self.npoints == 0:
            return

        if self.indices.min() < 0:
            raise ValueError("No index can be less than zero")

        if self.indices.max() >= self.length:
            raise ValueError("All indices must be less than the length")

        monotonic = np.all(self.indices[:-1] < self.indices[1:])
        if not monotonic:
            raise ValueError("Indices must be strictly increasing")

    def equals(self, other: object) -> bool:
        """
        Return True if ``other`` is an IntIndex with the same length and indices.
        """
        if not isinstance(other, IntIndex):
            return False

        if self is other:
            return True

        same_length = self.length == other.length
        same_indices = np.array_equal(self.indices, other.indices)
        return same_length and same_indices

    @property
    def ngaps(self) -> int:
        return self.length - self.npoints

    cpdef to_int_index(self):
        return self

    def to_block_index(self):
        locs, lens = get_blocks(self.indices)
        return BlockIndex(self.length, locs, lens)

    cpdef IntIndex intersect(self, SparseIndex y_):
        """
        Return a new IntIndex holding the positions present in both indexes.

        Raises
        ------
        ValueError
            If the two indexes do not have the same ``length``.
        """
        cdef:
            Py_ssize_t xi, yi = 0, result_indexer = 0
            int32_t xind
            ndarray[int32_t, ndim=1] xindices, yindices, new_indices
            IntIndex y

        # if is one already, returns self
        y = y_.to_int_index()

        if self.length != y.length:
            # ValueError (a subclass of Exception) to match make_union
            raise ValueError('Indices must reference same underlying length')

        xindices = self.indices
        yindices = y.indices
        # the intersection cannot have more points than the smaller input
        new_indices = np.empty(min(
            len(xindices), len(yindices)), dtype=np.int32)

        # merge-style walk over the two sorted index arrays
        for xi in range(self.npoints):
            xind = xindices[xi]

            while yi < y.npoints and yindices[yi] < xind:
                yi += 1

            if yi >= y.npoints:
                break

            # TODO: would a two-pass algorithm be faster?
            if yindices[yi] == xind:
                new_indices[result_indexer] = xind
                result_indexer += 1

        new_indices = new_indices[:result_indexer]
        return IntIndex(self.length, new_indices)

    cpdef IntIndex make_union(self, SparseIndex y_):
        """
        Return a new IntIndex holding the positions present in either index.

        Raises
        ------
        ValueError
            If the two indexes do not have the same ``length``.
        """

        cdef:
            ndarray[int32_t, ndim=1] new_indices
            IntIndex y

        # if is one already, returns self
        y = y_.to_int_index()

        if self.length != y.length:
            raise ValueError('Indices must reference same underlying length')

        new_indices = np.union1d(self.indices, y.indices)
        return IntIndex(self.length, new_indices)

    @cython.wraparound(False)
    cpdef int32_t lookup(self, Py_ssize_t index):
        """
        Return the internal location if value exists on given index.
        Return -1 otherwise.
        """
        cdef:
            int32_t res
            ndarray[int32_t, ndim=1] inds

        inds = self.indices
        if self.npoints == 0:
            return -1
        elif index < 0 or self.length <= index:
            return -1

        # binary search for the insertion point, then confirm an exact hit
        res = inds.searchsorted(index)
        if res == self.npoints:
            return -1
        elif inds[res] == index:
            return res
        else:
            return -1

    @cython.wraparound(False)
    cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
        """
        Vectorized lookup, returns ndarray[int32_t]

        Entries of ``indexer`` that are not stored in this index map to -1.
        """
        cdef:
            Py_ssize_t n
            ndarray[int32_t, ndim=1] inds
            ndarray[uint8_t, ndim=1, cast=True] mask
            ndarray[int32_t, ndim=1] masked
            ndarray[int32_t, ndim=1] res
            ndarray[int32_t, ndim=1] results

        n = len(indexer)
        results = np.empty(n, dtype=np.int32)
        results[:] = -1

        if self.npoints == 0:
            return results

        inds = self.indices
        # only values within [first, last] stored index can possibly match
        mask = (inds[0] <= indexer) & (indexer <= inds[len(inds) - 1])

        masked = indexer[mask]
        res = inds.searchsorted(masked).astype(np.int32)

        # searchsorted gives an insertion point; discard non-exact matches
        res[inds[res] != masked] = -1
        results[mask] = res
        return results
|
||||
|
||||
|
||||
cpdef get_blocks(ndarray[int32_t, ndim=1] indices):
    """
    Group integer positions into runs and return (starts, lengths).

    A new run begins whenever the gap to the previous position exceeds 1.
    Both returned arrays are int32; they are empty when ``indices`` is empty.
    """
    cdef:
        Py_ssize_t pos, n_indices, n_blocks = 0
        int32_t run_start, run_len = 1, here, before
        ndarray[int32_t, ndim=1] starts, sizes

    n_indices = len(indices)

    # nothing to group
    if n_indices == 0:
        return np.array([], dtype=np.int32), np.array([], dtype=np.int32)

    # there can never be more blocks than points
    starts = np.empty(n_indices, dtype=np.int32)
    sizes = np.empty(n_indices, dtype=np.int32)

    # TODO: two-pass algorithm faster?
    run_start = indices[0]
    before = run_start
    for pos in range(1, n_indices):
        here = indices[pos]
        if here - before <= 1:
            # still inside the current run
            run_len += 1
        else:
            # close the current run and open a new one at this position
            starts[n_blocks] = run_start
            sizes[n_blocks] = run_len
            n_blocks += 1
            run_start = here
            run_len = 1

        before = here

    # flush the final run
    starts[n_blocks] = run_start
    sizes[n_blocks] = run_len
    n_blocks += 1
    return starts[:n_blocks], sizes[:n_blocks]
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# BlockIndex
|
||||
|
||||
cdef class BlockIndex(SparseIndex):
    """
    Object for holding block-based sparse indexing information

    Parameters
    ----------
    length : integer
        Total length of the array being indexed.
    blocs : array-like
        Start location of each block; converted to int32.
    blengths : array-like
        Length of each block; converted to int32.
    """
    cdef readonly:
        int32_t nblocks, npoints, length
        ndarray blocs, blengths

    cdef:
        object __weakref__  # need to be picklable
        int32_t *locbuf
        int32_t *lenbuf

    def __init__(self, length, blocs, blengths):

        self.blocs = np.ascontiguousarray(blocs, dtype=np.int32)
        self.blengths = np.ascontiguousarray(blengths, dtype=np.int32)

        # raw pointers into the two arrays above, for fast C-level access
        self.locbuf = <int32_t*>self.blocs.data
        self.lenbuf = <int32_t*>self.blengths.data

        self.length = length
        self.nblocks = np.int32(len(self.blocs))
        self.npoints = self.blengths.sum()

        self.check_integrity()

    def __reduce__(self):
        # pickle support: rebuild from (length, blocs, blengths)
        args = (self.length, self.blocs, self.blengths)
        return BlockIndex, args

    def __repr__(self) -> str:
        output = 'BlockIndex\n'
        output += f'Block locations: {repr(self.blocs)}\n'
        output += f'Block lengths: {repr(self.blengths)}'

        return output

    @property
    def nbytes(self) -> int:
        return self.blocs.nbytes + self.blengths.nbytes

    @property
    def ngaps(self) -> int:
        return self.length - self.npoints

    cdef check_integrity(self):
        """
        Check:
        - Locations are in ascending order
        - No overlapping blocks
        - Blocks do not start after end of index, nor extend beyond end
        - No zero-length blocks

        A ValueError is raised if any of these conditions is violated.
        Adjacent (touching) blocks are accepted.
        """
        cdef:
            Py_ssize_t i
            ndarray[int32_t, ndim=1] blocs, blengths

        blocs = self.blocs
        blengths = self.blengths

        if len(blocs) != len(blengths):
            raise ValueError('block bound arrays must be same length')

        for i in range(self.nblocks):
            if i > 0:
                if blocs[i] <= blocs[i - 1]:
                    raise ValueError('Locations not in ascending order')

            if i < self.nblocks - 1:
                if blocs[i] + blengths[i] > blocs[i + 1]:
                    raise ValueError(f'Block {i} overlaps')
            else:
                if blocs[i] + blengths[i] > self.length:
                    raise ValueError(f'Block {i} extends beyond end')

            # no zero-length blocks
            if blengths[i] == 0:
                raise ValueError(f'Zero-length block {i}')

    def equals(self, other: object) -> bool:
        """
        Return True if ``other`` is a BlockIndex with the same length and blocks.
        """
        if not isinstance(other, BlockIndex):
            return False

        if self is other:
            return True

        same_length = self.length == other.length
        same_blocks = (np.array_equal(self.blocs, other.blocs) and
                       np.array_equal(self.blengths, other.blengths))
        return same_length and same_blocks

    def to_block_index(self):
        return self

    cpdef to_int_index(self):
        """
        Expand the blocks into an IntIndex listing every position explicitly.
        """
        cdef:
            int32_t i = 0, j, b
            int32_t offset
            ndarray[int32_t, ndim=1] indices

        indices = np.empty(self.npoints, dtype=np.int32)

        for b in range(self.nblocks):
            offset = self.locbuf[b]

            for j in range(self.lenbuf[b]):
                indices[i] = offset + j
                i += 1

        return IntIndex(self.length, indices)

    @property
    def indices(self):
        return self.to_int_index().indices

    cpdef BlockIndex intersect(self, SparseIndex other):
        """
        Intersect two BlockIndex objects

        Returns
        -------
        BlockIndex

        Raises
        ------
        Exception
            If the two indexes do not have the same ``length``.
        """
        cdef:
            BlockIndex y
            ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen, out_bloc, out_blen
            Py_ssize_t xi = 0, yi = 0, max_len, result_indexer = 0
            int32_t cur_loc, cur_length, diff

        y = other.to_block_index()

        if self.length != y.length:
            raise Exception('Indices must reference same underlying length')

        xloc = self.blocs
        xlen = self.blengths
        yloc = y.blocs
        ylen = y.blengths

        # Every loop iteration below advances exactly one of xi / yi and
        # writes at most one output block, so the number of output blocks
        # is bounded by the total number of input blocks. (A bound of
        # length // 2 + 1 is too small when blocks are adjacent.)
        max_len = self.nblocks + y.nblocks
        out_bloc = np.empty(max_len, dtype=np.int32)
        out_blen = np.empty(max_len, dtype=np.int32)

        while True:
            # we are done (or possibly never began)
            if xi >= self.nblocks or yi >= y.nblocks:
                break

            # completely symmetric...would like to avoid code dup but oh well
            if xloc[xi] >= yloc[yi]:
                cur_loc = xloc[xi]
                diff = xloc[xi] - yloc[yi]

                if ylen[yi] <= diff:
                    # have to skip this block
                    yi += 1
                    continue

                if ylen[yi] - diff < xlen[xi]:
                    # take end of y block, move onward
                    cur_length = ylen[yi] - diff
                    yi += 1
                else:
                    # take end of x block
                    cur_length = xlen[xi]
                    xi += 1

            else:  # xloc[xi] < yloc[yi]
                cur_loc = yloc[yi]
                diff = yloc[yi] - xloc[xi]

                if xlen[xi] <= diff:
                    # have to skip this block
                    xi += 1
                    continue

                if xlen[xi] - diff < ylen[yi]:
                    # take end of x block, move onward
                    cur_length = xlen[xi] - diff
                    xi += 1
                else:
                    # take end of y block
                    cur_length = ylen[yi]
                    yi += 1

            out_bloc[result_indexer] = cur_loc
            out_blen[result_indexer] = cur_length
            result_indexer += 1

        out_bloc = out_bloc[:result_indexer]
        out_blen = out_blen[:result_indexer]

        return BlockIndex(self.length, out_bloc, out_blen)

    cpdef BlockIndex make_union(self, SparseIndex y):
        """
        Combine together two BlockIndex objects, accepting indices if contained
        in one or the other

        Parameters
        ----------
        y : SparseIndex

        Notes
        -----
        union is a protected keyword in Cython, hence make_union

        Returns
        -------
        BlockIndex
        """
        return BlockUnion(self, y.to_block_index()).result

    cpdef Py_ssize_t lookup(self, Py_ssize_t index):
        """
        Return the internal location if value exists on given index.
        Return -1 otherwise.
        """
        cdef:
            Py_ssize_t i, cum_len
            ndarray[int32_t, ndim=1] locs, lens

        locs = self.blocs
        lens = self.blengths

        if self.nblocks == 0:
            return -1
        elif index < locs[0]:
            return -1

        # cum_len counts the points stored in the blocks before block i
        cum_len = 0
        for i in range(self.nblocks):
            if index >= locs[i] and index < locs[i] + lens[i]:
                return cum_len + index - locs[i]
            cum_len += lens[i]

        return -1

    @cython.wraparound(False)
    cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
        """
        Vectorized lookup, returns ndarray[int32_t]

        Entries of ``indexer`` that are not stored in this index map to -1.
        """
        cdef:
            Py_ssize_t n, i, j, ind_val, cum_len
            ndarray[int32_t, ndim=1] locs, lens
            ndarray[int32_t, ndim=1] results

        locs = self.blocs
        lens = self.blengths

        n = len(indexer)
        results = np.empty(n, dtype=np.int32)
        results[:] = -1

        if self.npoints == 0:
            return results

        for i in range(n):
            ind_val = indexer[i]
            if not (ind_val < 0 or self.length <= ind_val):
                cum_len = 0
                for j in range(self.nblocks):
                    if ind_val >= locs[j] and ind_val < locs[j] + lens[j]:
                        results[i] = cum_len + ind_val - locs[j]
                        # blocks never overlap, so no later block can match
                        break
                    cum_len += lens[j]
        return results
|
||||
|
||||
|
||||
@cython.internal
cdef class BlockMerge:
    """
    Object-oriented approach makes sharing state between recursive functions a
    lot easier and reduces code duplication

    Base class for merging two BlockIndex objects. The constructor stores
    both inputs, precomputes block start/end arrays, and then stores the
    return value of ``_make_merged_blocks`` (which subclasses must
    implement) in ``self.result``.
    """
    cdef:
        BlockIndex x, y, result
        # NOTE: xlen / ylen are declared but never assigned in this class
        ndarray xstart, xlen, xend, ystart, ylen, yend
        int32_t xi, yi  # block indices

    def __init__(self, BlockIndex x, BlockIndex y):
        self.x = x
        self.y = y

        if x.length != y.length:
            raise Exception('Indices must reference same underlying length')

        self.xstart = self.x.blocs
        self.ystart = self.y.blocs

        # exclusive end position of each block
        self.xend = self.x.blocs + self.x.blengths
        self.yend = self.y.blocs + self.y.blengths

        # self.xlen = self.x.blengths
        # self.ylen = self.y.blengths

        # cursors into the block arrays of x and y
        self.xi = 0
        self.yi = 0

        self.result = self._make_merged_blocks()

    cdef _make_merged_blocks(self):
        # abstract: subclasses build and return the merged BlockIndex
        raise NotImplementedError

    cdef _set_current_indices(self, int32_t xi, int32_t yi, bint mode):
        # Store the cursors; in mode 1 the caller worked with x and y
        # swapped, so the arguments are swapped back before storing.
        if mode == 0:
            self.xi = xi
            self.yi = yi
        else:
            self.xi = yi
            self.yi = xi
|
||||
|
||||
|
||||
@cython.internal
cdef class BlockUnion(BlockMerge):
    """
    Compute the union of two BlockIndex objects.

    Blocks from either input that overlap or touch are merged into a single
    output block; the merged index is built by ``_make_merged_blocks``.
    """

    cdef _make_merged_blocks(self):
        """
        Walk both block lists in order and return the merged BlockIndex.
        """
        cdef:
            ndarray[int32_t, ndim=1] xstart, xend, ystart
            ndarray[int32_t, ndim=1] yend, out_bloc, out_blen
            int32_t nstart, nend
            Py_ssize_t max_len, result_indexer = 0

        xstart = self.xstart
        xend = self.xend
        ystart = self.ystart
        yend = self.yend

        # Each output block consumes at least one input block, so the total
        # number of input blocks bounds the output size. (A bound of
        # length // 2 + 1 is too small when an input has adjacent blocks.)
        max_len = self.x.nblocks + self.y.nblocks
        out_bloc = np.empty(max_len, dtype=np.int32)
        out_blen = np.empty(max_len, dtype=np.int32)

        while True:
            # we are done (or possibly never began)
            if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks:
                break
            elif self.yi >= self.y.nblocks:
                # through with y, just pass through x blocks
                nstart = xstart[self.xi]
                nend = xend[self.xi]
                self.xi += 1
            elif self.xi >= self.x.nblocks:
                # through with x, just pass through y blocks
                nstart = ystart[self.yi]
                nend = yend[self.yi]
                self.yi += 1
            else:
                # find end of new block
                if xstart[self.xi] < ystart[self.yi]:
                    nstart = xstart[self.xi]
                    nend = self._find_next_block_end(0)
                else:
                    nstart = ystart[self.yi]
                    nend = self._find_next_block_end(1)

            out_bloc[result_indexer] = nstart
            out_blen[result_indexer] = nend - nstart
            result_indexer += 1

        out_bloc = out_bloc[:result_indexer]
        out_blen = out_blen[:result_indexer]

        return BlockIndex(self.x.length, out_bloc, out_blen)

    cdef int32_t _find_next_block_end(self, bint mode) except -1:
        """
        Return the end of the merged block that starts at the current block,
        advancing the stored block cursors past everything it absorbs.

        mode 0: block started in index x
        mode 1: block started in index y
        """
        cdef:
            ndarray[int32_t, ndim=1] xstart, xend, ystart, yend
            int32_t xi, yi, ynblocks, nend

        if mode != 0 and mode != 1:
            raise Exception('Mode must be 0 or 1')

        # so symmetric code will work: locally "x" is always the index the
        # current block started in, "y" is the other one
        if mode == 0:
            xstart = self.xstart
            xend = self.xend
            xi = self.xi

            ystart = self.ystart
            yend = self.yend
            yi = self.yi
            ynblocks = self.y.nblocks
        else:
            xstart = self.ystart
            xend = self.yend
            xi = self.yi

            ystart = self.xstart
            yend = self.xend
            yi = self.xi
            ynblocks = self.x.nblocks

        nend = xend[xi]

        # done with y?
        if yi == ynblocks:
            self._set_current_indices(xi + 1, yi, mode)
            return nend
        elif nend < ystart[yi]:
            # block ends before y block
            self._set_current_indices(xi + 1, yi, mode)
            return nend
        else:
            # skip every y block that ends before the current block does
            while yi < ynblocks and nend > yend[yi]:
                yi += 1

            self._set_current_indices(xi + 1, yi, mode)

            if yi == ynblocks:
                return nend

            if nend < ystart[yi]:
                # we're done, return the block end
                return nend
            else:
                # merge blocks, continue searching from the other index;
                # this also catches the case where the blocks merely touch
                # (nend == ystart[yi])
                return self._find_next_block_end(1 - mode)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Sparse arithmetic
|
||||
|
||||
include "sparse_op_helper.pxi"
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# SparseArray mask create operations
|
||||
|
||||
def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
    """
    Build a boolean mask over an object array.

    The mask is False where an element compares equal to ``fill_value`` and
    has the same type as it, and True everywhere else.
    """
    cdef:
        object item
        Py_ssize_t pos
        Py_ssize_t n_items = len(arr)
        ndarray[int8_t, ndim=1] keep

    # start with everything kept, then clear the fill-value positions
    keep = np.ones(n_items, dtype=np.int8)

    for pos in range(n_items):
        item = arr[pos]
        if not (item == fill_value and type(item) == type(fill_value)):
            continue
        keep[pos] = 0

    return keep.view(dtype=bool)
|
||||
@@ -0,0 +1,309 @@
|
||||
"""
|
||||
Template for each `dtype` helper function for sparse ops
|
||||
|
||||
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
||||
"""
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Sparse op
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
ctypedef fused sparse_t:
|
||||
float64_t
|
||||
int64_t
|
||||
|
||||
|
||||
cdef inline float64_t __div__(sparse_t a, sparse_t b):
    # True division that never raises: dividing by zero yields +INF, -INF or
    # NaN depending on the sign of the numerator.
    if b != 0:
        return float(a) / b

    if a > 0:
        return INF
    if a < 0:
        return -INF
    return NaN
|
||||
|
||||
|
||||
cdef inline float64_t __truediv__(sparse_t a, sparse_t b):
    # true division is the same operation as __div__; delegate to it
    return __div__(a, b)
|
||||
|
||||
|
||||
cdef inline sparse_t __mod__(sparse_t a, sparse_t b):
    # Modulo that never raises: a zero divisor yields NaN for floats and 0
    # for integers.
    if b != 0:
        return a % b

    # resolved per fused-type specialization
    if sparse_t is float64_t:
        return NaN
    else:
        return 0
|
||||
|
||||
|
||||
cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b):
    # Floor division that never raises on a zero divisor.
    if b != 0:
        return a // b

    # resolved per fused-type specialization
    if sparse_t is float64_t:
        # Match non-sparse Series behavior implemented in mask_zero_div_zero
        if a > 0:
            return INF
        if a < 0:
            return -INF
        return NaN
    else:
        return 0
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# sparse array op
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
{{py:
|
||||
|
||||
# dtype, arith_comp_group, logical_group
|
||||
dtypes = [('float64', True, False),
|
||||
('int64', True, True),
|
||||
('uint8', False, True)]
|
||||
# do not generate arithmetic / comparison template for uint8,
|
||||
# it should be done in fused types
|
||||
|
||||
def get_op(tup):
    """
    Render the source expression for one sparse operation.

    ``tup`` is ``(opname, lval, rval, dtype)``; the result is the expression
    text for ``opname`` applied to ``lval`` and ``rval``. ``dtype`` is part
    of the tuple but does not influence the output.
    """
    assert isinstance(tup, tuple)
    assert len(tup) == 4

    opname, lval, rval, dtype = tup

    arithmetic = {
        'add': '{0} + {1}',
        'sub': '{0} - {1}',
        'mul': '{0} * {1}',
        'div': '__div__({0}, {1})',
        'mod': '__mod__({0}, {1})',
        'truediv': '__truediv__({0}, {1})',
        'floordiv': '__floordiv__({0}, {1})',
        'pow': '{0} ** {1}',
    }
    comparison = {
        'eq': '{0} == {1}',
        'ne': '{0} != {1}',
        'lt': '{0} < {1}',
        'gt': '{0} > {1}',
        'le': '{0} <= {1}',
        'ge': '{0} >= {1}',
    }
    logical = {
        'and': '{0} & {1}',
        'or': '{0} | {1}',
        'xor': '{0} ^ {1}',
    }

    templates = {**arithmetic, **comparison, **logical}
    template = templates[opname]
    return template.format(lval, rval)
|
||||
|
||||
|
||||
def get_dispatch(dtypes):
    """
    Yield ``(opname, dtype, rdtype)`` for every function to be generated.

    ``dtypes`` is an iterable of ``(dtype, arith_comp_group, logical_group)``
    tuples. Logical ops are generated only for dtypes with ``logical_group``
    set, all other ops only for dtypes with ``arith_comp_group`` set.
    ``rdtype`` is the result dtype: float64 for div/truediv, uint8 for
    comparison and logical ops, otherwise the input dtype.
    """
    arithmetic = ('add', 'sub', 'mul', 'div', 'mod', 'truediv',
                  'floordiv', 'pow')
    comparison = ('eq', 'ne', 'lt', 'gt', 'le', 'ge')
    logical = ('and', 'or', 'xor')

    for opname in arithmetic + comparison + logical:
        is_logical = opname in logical

        for dtype, arith_comp_group, logical_group in dtypes:
            if opname in ('div', 'truediv'):
                rdtype = 'float64'
            elif is_logical or opname in comparison:
                # comparison and logical results are stored as uint8
                rdtype = 'uint8'
            else:
                rdtype = dtype

            enabled = logical_group if is_logical else arith_comp_group
            if enabled:
                yield opname, dtype, rdtype
|
||||
|
||||
}}
|
||||
|
||||
|
||||
{{for opname, dtype, rdtype in get_dispatch(dtypes)}}
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
                                                BlockIndex xindex,
                                                {{dtype}}_t xfill,
                                                {{dtype}}_t[:] y_,
                                                BlockIndex yindex,
                                                {{dtype}}_t yfill):
    '''
    Binary operator on BlockIndex objects with fill values

    Returns a tuple ``(out, out_index, fill)``: the result values at every
    position of the union of the two indexes, that union index, and the
    operator applied to the two fill values.
    '''

    cdef:
        BlockIndex out_index
        Py_ssize_t xi = 0, yi = 0, out_i = 0  # fp buf indices
        int32_t xbp = 0, ybp = 0  # block positions
        int32_t xloc, yloc
        Py_ssize_t xblock = 0, yblock = 0  # block numbers

        {{dtype}}_t[:] x, y
        ndarray[{{rdtype}}_t, ndim=1] out

    # to suppress Cython warning
    x = x_
    y = y_

    # the result is defined wherever either operand has a stored value
    out_index = xindex.make_union(yindex)
    out = np.empty(out_index.npoints, dtype=np.{{rdtype}})

    # Wow, what a hack job. Need to do something about this

    # walk the two SparseVectors, adding matched locations...
    for out_i in range(out_index.npoints):
        if yblock == yindex.nblocks:
            # y is exhausted, so the remaining positions come from x only
            # use y fill value
            out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
            xi += 1

            # advance x location
            xbp += 1
            if xbp == xindex.lenbuf[xblock]:
                xblock += 1
                xbp = 0
            continue

        if xblock == xindex.nblocks:
            # x is exhausted, so the remaining positions come from y only
            # use x fill value
            out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
            yi += 1

            # advance y location
            ybp += 1
            if ybp == yindex.lenbuf[yblock]:
                yblock += 1
                ybp = 0
            continue

        # absolute positions currently pointed at in each operand
        yloc = yindex.locbuf[yblock] + ybp
        xloc = xindex.locbuf[xblock] + xbp

        # each index in the out_index had to come from either x, y, or both
        if xloc == yloc:
            out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
            xi += 1
            yi += 1

            # advance both locations
            xbp += 1
            if xbp == xindex.lenbuf[xblock]:
                xblock += 1
                xbp = 0

            ybp += 1
            if ybp == yindex.lenbuf[yblock]:
                yblock += 1
                ybp = 0

        elif xloc < yloc:
            # use y fill value
            out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
            xi += 1

            # advance x location
            xbp += 1
            if xbp == xindex.lenbuf[xblock]:
                xblock += 1
                xbp = 0
        else:
            # use x fill value
            out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
            yi += 1

            # advance y location
            ybp += 1
            if ybp == yindex.lenbuf[yblock]:
                yblock += 1
                ybp = 0

    return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
|
||||
|
||||
|
||||
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline tuple int_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
                                              IntIndex xindex,
                                              {{dtype}}_t xfill,
                                              {{dtype}}_t[:] y_,
                                              IntIndex yindex,
                                              {{dtype}}_t yfill):
    # Binary operator on IntIndex objects with fill values.
    #
    # Returns a tuple (out, out_index, fill): the result values at every
    # position of the union of the two indexes, that union index, and the
    # operator applied to the two fill values.
    cdef:
        IntIndex out_index
        Py_ssize_t xi = 0, yi = 0, out_i = 0  # fp buf indices
        int32_t xloc, yloc
        int32_t[:] xindices, yindices, out_indices
        {{dtype}}_t[:] x, y
        ndarray[{{rdtype}}_t, ndim=1] out

    # suppress Cython compiler warnings due to inlining
    x = x_
    y = y_

    # need to do this first to know size of result array
    out_index = xindex.make_union(yindex)
    out = np.empty(out_index.npoints, dtype=np.{{rdtype}})

    xindices = xindex.indices
    yindices = yindex.indices
    out_indices = out_index.indices

    # walk the two SparseVectors, adding matched locations...
    for out_i in range(out_index.npoints):
        if xi == xindex.npoints:
            # x is exhausted, so the remaining positions come from y only
            # use x fill value
            out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
            yi += 1
            continue

        if yi == yindex.npoints:
            # y is exhausted, so the remaining positions come from x only
            # use y fill value
            out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
            xi += 1
            continue

        xloc = xindices[xi]
        yloc = yindices[yi]

        # each index in the out_index had to come from either x, y, or both
        if xloc == yloc:
            out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
            xi += 1
            yi += 1
        elif xloc < yloc:
            # use y fill value
            out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
            xi += 1
        else:
            # use x fill value
            out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
            yi += 1

    return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
|
||||
|
||||
|
||||
cpdef sparse_{{opname}}_{{dtype}}({{dtype}}_t[:] x,
                                  SparseIndex xindex, {{dtype}}_t xfill,
                                  {{dtype}}_t[:] y,
                                  SparseIndex yindex, {{dtype}}_t yfill):
    # Dispatch on the concrete type of xindex. yindex is converted to the
    # same kind of index so both operands are walked the same way.
    # Raises NotImplementedError for any other SparseIndex subclass.

    if isinstance(xindex, BlockIndex):
        return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill,
                                             y, yindex.to_block_index(), yfill)
    elif isinstance(xindex, IntIndex):
        return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill,
                                           y, yindex.to_int_index(), yfill)
    else:
        raise NotImplementedError
|
||||
|
||||
{{endfor}}
|
||||
@@ -0,0 +1,48 @@
|
||||
#ifndef _PANDAS_MATH_H_
#define _PANDAS_MATH_H_

// Every branch below defines std::notnan(double); some branches additionally
// define std::isnan and/or std::signbit for toolchains handled specially.
//
// MSVC 2017 has a bug where `x == x` can be true for NaNs.
// MSC_VER from https://stackoverflow.com/a/70630/1889400
// Place upper bound on this check once a fixed MSVC is released.
#if defined(_MSC_VER) && (_MSC_VER < 1800)
#include <cmath>
// In older versions of Visual Studio there wasn't a std::signbit defined
// This defines it using _copysign
namespace std {
   __inline int isnan(double x) { return _isnan(x); }
   __inline int signbit(double num) { return _copysign(1.0, num) < 0; }
   __inline int notnan(double x) { return !isnan(x); }
}
#elif defined(_MSC_VER) && (_MSC_VER >= 1900)
#include <cmath>
// notnan goes through _isnan here instead of relying on `x == x`
// (see the MSVC note at the top of this header).
namespace std {
  __inline int isnan(double x) { return _isnan(x); }
  __inline int notnan(double x) { return !isnan(x); }
}
#elif defined(_MSC_VER)
#include <cmath>
namespace std {
  __inline int isnan(double x) { return _isnan(x); }
  __inline int notnan(double x) { return x == x; }
}
#elif defined(__MVS__)
#include <cmath>

// `_signbit` is an object-like macro that expands to the token `signbit`.
// The signbit and isnan macros are then removed so that functions with those
// names can be declared inside namespace std below.
#define _signbit signbit
#undef signbit
#undef isnan

namespace std {
  __inline int notnan(double x) { return x == x; }
  // NOTE(review): macro replacement happens at the point of use, so
  // `_signbit(num)` becomes `signbit(num)` after the #undef above; confirm on
  // z/OS that this does not resolve to this very function.
  __inline int signbit(double num) { return _signbit(num); }
  // A NaN is the only value that compares unequal to itself.  The previous
  // body called isnan(x), which (with the macro #undef'd) names this function
  // and therefore recursed without end.
  __inline int isnan(double x) { return x != x; }
}
#else
#include <cmath>

namespace std {
  __inline int notnan(double x) { return x == x; }
}

#endif
#endif
|
||||
@@ -0,0 +1,305 @@
|
||||
// ISO C9x compliant inttypes.h for Microsoft Visual Studio
|
||||
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
|
||||
//
|
||||
// Copyright (c) 2006 Alexander Chemeris
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. The name of the author may be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _MSC_VER // [
|
||||
#error "Use this header only with Microsoft Visual C++ compilers!"
|
||||
#endif // _MSC_VER ]
|
||||
|
||||
#ifndef _MSC_INTTYPES_H_ // [
|
||||
#define _MSC_INTTYPES_H_
|
||||
|
||||
#if _MSC_VER > 1000
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#include "ms_stdint.h"
|
||||
|
||||
// 7.8 Format conversion of integer types
|
||||
|
||||
typedef struct {
|
||||
intmax_t quot;
|
||||
intmax_t rem;
|
||||
} imaxdiv_t;
|
||||
|
||||
// 7.8.1 Macros for format specifiers
|
||||
|
||||
#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198
|
||||
|
||||
// The fprintf macros for signed integers are:
|
||||
#define PRId8 "d"
|
||||
#define PRIi8 "i"
|
||||
#define PRIdLEAST8 "d"
|
||||
#define PRIiLEAST8 "i"
|
||||
#define PRIdFAST8 "d"
|
||||
#define PRIiFAST8 "i"
|
||||
|
||||
#define PRId16 "hd"
|
||||
#define PRIi16 "hi"
|
||||
#define PRIdLEAST16 "hd"
|
||||
#define PRIiLEAST16 "hi"
|
||||
#define PRIdFAST16 "hd"
|
||||
#define PRIiFAST16 "hi"
|
||||
|
||||
#define PRId32 "I32d"
|
||||
#define PRIi32 "I32i"
|
||||
#define PRIdLEAST32 "I32d"
|
||||
#define PRIiLEAST32 "I32i"
|
||||
#define PRIdFAST32 "I32d"
|
||||
#define PRIiFAST32 "I32i"
|
||||
|
||||
#define PRId64 "I64d"
|
||||
#define PRIi64 "I64i"
|
||||
#define PRIdLEAST64 "I64d"
|
||||
#define PRIiLEAST64 "I64i"
|
||||
#define PRIdFAST64 "I64d"
|
||||
#define PRIiFAST64 "I64i"
|
||||
|
||||
#define PRIdMAX "I64d"
|
||||
#define PRIiMAX "I64i"
|
||||
|
||||
#define PRIdPTR "Id"
|
||||
#define PRIiPTR "Ii"
|
||||
|
||||
// The fprintf macros for unsigned integers are:
|
||||
#define PRIo8 "o"
|
||||
#define PRIu8 "u"
|
||||
#define PRIx8 "x"
|
||||
#define PRIX8 "X"
|
||||
#define PRIoLEAST8 "o"
|
||||
#define PRIuLEAST8 "u"
|
||||
#define PRIxLEAST8 "x"
|
||||
#define PRIXLEAST8 "X"
|
||||
#define PRIoFAST8 "o"
|
||||
#define PRIuFAST8 "u"
|
||||
#define PRIxFAST8 "x"
|
||||
#define PRIXFAST8 "X"
|
||||
|
||||
#define PRIo16 "ho"
|
||||
#define PRIu16 "hu"
|
||||
#define PRIx16 "hx"
|
||||
#define PRIX16 "hX"
|
||||
#define PRIoLEAST16 "ho"
|
||||
#define PRIuLEAST16 "hu"
|
||||
#define PRIxLEAST16 "hx"
|
||||
#define PRIXLEAST16 "hX"
|
||||
#define PRIoFAST16 "ho"
|
||||
#define PRIuFAST16 "hu"
|
||||
#define PRIxFAST16 "hx"
|
||||
#define PRIXFAST16 "hX"
|
||||
|
||||
#define PRIo32 "I32o"
|
||||
#define PRIu32 "I32u"
|
||||
#define PRIx32 "I32x"
|
||||
#define PRIX32 "I32X"
|
||||
#define PRIoLEAST32 "I32o"
|
||||
#define PRIuLEAST32 "I32u"
|
||||
#define PRIxLEAST32 "I32x"
|
||||
#define PRIXLEAST32 "I32X"
|
||||
#define PRIoFAST32 "I32o"
|
||||
#define PRIuFAST32 "I32u"
|
||||
#define PRIxFAST32 "I32x"
|
||||
#define PRIXFAST32 "I32X"
|
||||
|
||||
#define PRIo64 "I64o"
|
||||
#define PRIu64 "I64u"
|
||||
#define PRIx64 "I64x"
|
||||
#define PRIX64 "I64X"
|
||||
#define PRIoLEAST64 "I64o"
|
||||
#define PRIuLEAST64 "I64u"
|
||||
#define PRIxLEAST64 "I64x"
|
||||
#define PRIXLEAST64 "I64X"
|
||||
#define PRIoFAST64 "I64o"
|
||||
#define PRIuFAST64 "I64u"
|
||||
#define PRIxFAST64 "I64x"
|
||||
#define PRIXFAST64 "I64X"
|
||||
|
||||
#define PRIoMAX "I64o"
|
||||
#define PRIuMAX "I64u"
|
||||
#define PRIxMAX "I64x"
|
||||
#define PRIXMAX "I64X"
|
||||
|
||||
#define PRIoPTR "Io"
|
||||
#define PRIuPTR "Iu"
|
||||
#define PRIxPTR "Ix"
|
||||
#define PRIXPTR "IX"
|
||||
|
||||
// The fscanf macros for signed integers are:
|
||||
#define SCNd8 "d"
|
||||
#define SCNi8 "i"
|
||||
#define SCNdLEAST8 "d"
|
||||
#define SCNiLEAST8 "i"
|
||||
#define SCNdFAST8 "d"
|
||||
#define SCNiFAST8 "i"
|
||||
|
||||
#define SCNd16 "hd"
|
||||
#define SCNi16 "hi"
|
||||
#define SCNdLEAST16 "hd"
|
||||
#define SCNiLEAST16 "hi"
|
||||
#define SCNdFAST16 "hd"
|
||||
#define SCNiFAST16 "hi"
|
||||
|
||||
#define SCNd32 "ld"
|
||||
#define SCNi32 "li"
|
||||
#define SCNdLEAST32 "ld"
|
||||
#define SCNiLEAST32 "li"
|
||||
#define SCNdFAST32 "ld"
|
||||
#define SCNiFAST32 "li"
|
||||
|
||||
#define SCNd64 "I64d"
|
||||
#define SCNi64 "I64i"
|
||||
#define SCNdLEAST64 "I64d"
|
||||
#define SCNiLEAST64 "I64i"
|
||||
#define SCNdFAST64 "I64d"
|
||||
#define SCNiFAST64 "I64i"
|
||||
|
||||
#define SCNdMAX "I64d"
|
||||
#define SCNiMAX "I64i"
|
||||
|
||||
#ifdef _WIN64 // [
|
||||
# define SCNdPTR "I64d"
|
||||
# define SCNiPTR "I64i"
|
||||
#else // _WIN64 ][
|
||||
# define SCNdPTR "ld"
|
||||
# define SCNiPTR "li"
|
||||
#endif // _WIN64 ]
|
||||
|
||||
// The fscanf macros for unsigned integers are:
|
||||
#define SCNo8 "o"
|
||||
#define SCNu8 "u"
|
||||
#define SCNx8 "x"
|
||||
#define SCNX8 "X"
|
||||
#define SCNoLEAST8 "o"
|
||||
#define SCNuLEAST8 "u"
|
||||
#define SCNxLEAST8 "x"
|
||||
#define SCNXLEAST8 "X"
|
||||
#define SCNoFAST8 "o"
|
||||
#define SCNuFAST8 "u"
|
||||
#define SCNxFAST8 "x"
|
||||
#define SCNXFAST8 "X"
|
||||
|
||||
#define SCNo16 "ho"
|
||||
#define SCNu16 "hu"
|
||||
#define SCNx16 "hx"
|
||||
#define SCNX16 "hX"
|
||||
#define SCNoLEAST16 "ho"
|
||||
#define SCNuLEAST16 "hu"
|
||||
#define SCNxLEAST16 "hx"
|
||||
#define SCNXLEAST16 "hX"
|
||||
#define SCNoFAST16 "ho"
|
||||
#define SCNuFAST16 "hu"
|
||||
#define SCNxFAST16 "hx"
|
||||
#define SCNXFAST16 "hX"
|
||||
|
||||
#define SCNo32 "lo"
|
||||
#define SCNu32 "lu"
|
||||
#define SCNx32 "lx"
|
||||
#define SCNX32 "lX"
|
||||
#define SCNoLEAST32 "lo"
|
||||
#define SCNuLEAST32 "lu"
|
||||
#define SCNxLEAST32 "lx"
|
||||
#define SCNXLEAST32 "lX"
|
||||
#define SCNoFAST32 "lo"
|
||||
#define SCNuFAST32 "lu"
|
||||
#define SCNxFAST32 "lx"
|
||||
#define SCNXFAST32 "lX"
|
||||
|
||||
#define SCNo64 "I64o"
|
||||
#define SCNu64 "I64u"
|
||||
#define SCNx64 "I64x"
|
||||
#define SCNX64 "I64X"
|
||||
#define SCNoLEAST64 "I64o"
|
||||
#define SCNuLEAST64 "I64u"
|
||||
#define SCNxLEAST64 "I64x"
|
||||
#define SCNXLEAST64 "I64X"
|
||||
#define SCNoFAST64 "I64o"
|
||||
#define SCNuFAST64 "I64u"
|
||||
#define SCNxFAST64 "I64x"
|
||||
#define SCNXFAST64 "I64X"
|
||||
|
||||
#define SCNoMAX "I64o"
|
||||
#define SCNuMAX "I64u"
|
||||
#define SCNxMAX "I64x"
|
||||
#define SCNXMAX "I64X"
|
||||
|
||||
#ifdef _WIN64 // [
|
||||
# define SCNoPTR "I64o"
|
||||
# define SCNuPTR "I64u"
|
||||
# define SCNxPTR "I64x"
|
||||
# define SCNXPTR "I64X"
|
||||
#else // _WIN64 ][
|
||||
# define SCNoPTR "lo"
|
||||
# define SCNuPTR "lu"
|
||||
# define SCNxPTR "lx"
|
||||
# define SCNXPTR "lX"
|
||||
#endif // _WIN64 ]
|
||||
|
||||
#endif // __STDC_FORMAT_MACROS ]
|
||||
|
||||
// 7.8.2 Functions for greatest-width integer types
|
||||
|
||||
// 7.8.2.1 The imaxabs function
|
||||
#define imaxabs _abs64
|
||||
|
||||
// 7.8.2.2 The imaxdiv function
|
||||
|
||||
// This is a modified version of the div() function from Microsoft's div.c found
|
||||
// in %MSVC.NET%\crt\src\div.c
|
||||
#ifdef STATIC_IMAXDIV // [
|
||||
static
|
||||
#else // STATIC_IMAXDIV ][
|
||||
_inline
|
||||
#endif // STATIC_IMAXDIV ]
|
||||
imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
|
||||
{
|
||||
imaxdiv_t result;
|
||||
|
||||
result.quot = numer / denom;
|
||||
result.rem = numer % denom;
|
||||
|
||||
if (numer < 0 && result.rem > 0) {
|
||||
// did division wrong; must fix up
|
||||
++result.quot;
|
||||
result.rem -= denom;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// 7.8.2.3 The strtoimax and strtoumax functions
|
||||
#define strtoimax _strtoi64
|
||||
#define strtoumax _strtoui64
|
||||
|
||||
// 7.8.2.4 The wcstoimax and wcstoumax functions
|
||||
#define wcstoimax _wcstoi64
|
||||
#define wcstoumax _wcstoui64
|
||||
|
||||
|
||||
#endif // _MSC_INTTYPES_H_ ]
|
||||
@@ -0,0 +1,247 @@
|
||||
// ISO C9x compliant stdint.h for Microsoft Visual Studio
|
||||
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
|
||||
//
|
||||
// Copyright (c) 2006-2008 Alexander Chemeris
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. The name of the author may be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
||||
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _MSC_VER // [
|
||||
#error "Use this header only with Microsoft Visual C++ compilers!"
|
||||
#endif // _MSC_VER ]
|
||||
|
||||
#ifndef _MSC_STDINT_H_ // [
|
||||
#define _MSC_STDINT_H_
|
||||
|
||||
#if _MSC_VER > 1000
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
|
||||
// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
|
||||
// or compiler give many errors like this:
|
||||
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
# include <wchar.h>
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Define _W64 macros to mark types changing their size, like intptr_t.
|
||||
#ifndef _W64
|
||||
# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
|
||||
# define _W64 __w64
|
||||
# else
|
||||
# define _W64
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
// 7.18.1 Integer types
|
||||
|
||||
// 7.18.1.1 Exact-width integer types
|
||||
|
||||
// Visual Studio 6 and Embedded Visual C++ 4 don't
|
||||
// realize that, e.g. char has the same size as __int8
|
||||
// so we give up on __intX for them.
|
||||
#if (_MSC_VER < 1300)
|
||||
typedef signed char int8_t;
|
||||
typedef signed short int16_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
#else
|
||||
typedef signed __int8 int8_t;
|
||||
typedef signed __int16 int16_t;
|
||||
typedef signed __int32 int32_t;
|
||||
typedef unsigned __int8 uint8_t;
|
||||
typedef unsigned __int16 uint16_t;
|
||||
typedef unsigned __int32 uint32_t;
|
||||
#endif
|
||||
typedef signed __int64 int64_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
|
||||
|
||||
// 7.18.1.2 Minimum-width integer types
|
||||
typedef int8_t int_least8_t;
|
||||
typedef int16_t int_least16_t;
|
||||
typedef int32_t int_least32_t;
|
||||
typedef int64_t int_least64_t;
|
||||
typedef uint8_t uint_least8_t;
|
||||
typedef uint16_t uint_least16_t;
|
||||
typedef uint32_t uint_least32_t;
|
||||
typedef uint64_t uint_least64_t;
|
||||
|
||||
// 7.18.1.3 Fastest minimum-width integer types
|
||||
typedef int8_t int_fast8_t;
|
||||
typedef int16_t int_fast16_t;
|
||||
typedef int32_t int_fast32_t;
|
||||
typedef int64_t int_fast64_t;
|
||||
typedef uint8_t uint_fast8_t;
|
||||
typedef uint16_t uint_fast16_t;
|
||||
typedef uint32_t uint_fast32_t;
|
||||
typedef uint64_t uint_fast64_t;
|
||||
|
||||
// 7.18.1.4 Integer types capable of holding object pointers
|
||||
#ifdef _WIN64 // [
|
||||
typedef signed __int64 intptr_t;
|
||||
typedef unsigned __int64 uintptr_t;
|
||||
#else // _WIN64 ][
|
||||
typedef _W64 signed int intptr_t;
|
||||
typedef _W64 unsigned int uintptr_t;
|
||||
#endif // _WIN64 ]
|
||||
|
||||
// 7.18.1.5 Greatest-width integer types
|
||||
typedef int64_t intmax_t;
|
||||
typedef uint64_t uintmax_t;
|
||||
|
||||
|
||||
// 7.18.2 Limits of specified-width integer types
|
||||
|
||||
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
|
||||
|
||||
// 7.18.2.1 Limits of exact-width integer types
|
||||
#define INT8_MIN ((int8_t)_I8_MIN)
|
||||
#define INT8_MAX _I8_MAX
|
||||
#define INT16_MIN ((int16_t)_I16_MIN)
|
||||
#define INT16_MAX _I16_MAX
|
||||
#define INT32_MIN ((int32_t)_I32_MIN)
|
||||
#define INT32_MAX _I32_MAX
|
||||
#define INT64_MIN ((int64_t)_I64_MIN)
|
||||
#define INT64_MAX _I64_MAX
|
||||
#define UINT8_MAX _UI8_MAX
|
||||
#define UINT16_MAX _UI16_MAX
|
||||
#define UINT32_MAX _UI32_MAX
|
||||
#define UINT64_MAX _UI64_MAX
|
||||
|
||||
// 7.18.2.2 Limits of minimum-width integer types
|
||||
#define INT_LEAST8_MIN INT8_MIN
|
||||
#define INT_LEAST8_MAX INT8_MAX
|
||||
#define INT_LEAST16_MIN INT16_MIN
|
||||
#define INT_LEAST16_MAX INT16_MAX
|
||||
#define INT_LEAST32_MIN INT32_MIN
|
||||
#define INT_LEAST32_MAX INT32_MAX
|
||||
#define INT_LEAST64_MIN INT64_MIN
|
||||
#define INT_LEAST64_MAX INT64_MAX
|
||||
#define UINT_LEAST8_MAX UINT8_MAX
|
||||
#define UINT_LEAST16_MAX UINT16_MAX
|
||||
#define UINT_LEAST32_MAX UINT32_MAX
|
||||
#define UINT_LEAST64_MAX UINT64_MAX
|
||||
|
||||
// 7.18.2.3 Limits of fastest minimum-width integer types
|
||||
#define INT_FAST8_MIN INT8_MIN
|
||||
#define INT_FAST8_MAX INT8_MAX
|
||||
#define INT_FAST16_MIN INT16_MIN
|
||||
#define INT_FAST16_MAX INT16_MAX
|
||||
#define INT_FAST32_MIN INT32_MIN
|
||||
#define INT_FAST32_MAX INT32_MAX
|
||||
#define INT_FAST64_MIN INT64_MIN
|
||||
#define INT_FAST64_MAX INT64_MAX
|
||||
#define UINT_FAST8_MAX UINT8_MAX
|
||||
#define UINT_FAST16_MAX UINT16_MAX
|
||||
#define UINT_FAST32_MAX UINT32_MAX
|
||||
#define UINT_FAST64_MAX UINT64_MAX
|
||||
|
||||
// 7.18.2.4 Limits of integer types capable of holding object pointers
|
||||
#ifdef _WIN64 // [
|
||||
# define INTPTR_MIN INT64_MIN
|
||||
# define INTPTR_MAX INT64_MAX
|
||||
# define UINTPTR_MAX UINT64_MAX
|
||||
#else // _WIN64 ][
|
||||
# define INTPTR_MIN INT32_MIN
|
||||
# define INTPTR_MAX INT32_MAX
|
||||
# define UINTPTR_MAX UINT32_MAX
|
||||
#endif // _WIN64 ]
|
||||
|
||||
// 7.18.2.5 Limits of greatest-width integer types
|
||||
#define INTMAX_MIN INT64_MIN
|
||||
#define INTMAX_MAX INT64_MAX
|
||||
#define UINTMAX_MAX UINT64_MAX
|
||||
|
||||
// 7.18.3 Limits of other integer types
|
||||
|
||||
#ifdef _WIN64 // [
|
||||
# define PTRDIFF_MIN _I64_MIN
|
||||
# define PTRDIFF_MAX _I64_MAX
|
||||
#else // _WIN64 ][
|
||||
# define PTRDIFF_MIN _I32_MIN
|
||||
# define PTRDIFF_MAX _I32_MAX
|
||||
#endif // _WIN64 ]
|
||||
|
||||
#define SIG_ATOMIC_MIN INT_MIN
|
||||
#define SIG_ATOMIC_MAX INT_MAX
|
||||
|
||||
#ifndef SIZE_MAX // [
|
||||
# ifdef _WIN64 // [
|
||||
# define SIZE_MAX _UI64_MAX
|
||||
# else // _WIN64 ][
|
||||
# define SIZE_MAX _UI32_MAX
|
||||
# endif // _WIN64 ]
|
||||
#endif // SIZE_MAX ]
|
||||
|
||||
// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
|
||||
#ifndef WCHAR_MIN // [
|
||||
# define WCHAR_MIN 0
|
||||
#endif // WCHAR_MIN ]
|
||||
#ifndef WCHAR_MAX // [
|
||||
# define WCHAR_MAX _UI16_MAX
|
||||
#endif // WCHAR_MAX ]
|
||||
|
||||
#define WINT_MIN 0
|
||||
#define WINT_MAX _UI16_MAX
|
||||
|
||||
#endif // __STDC_LIMIT_MACROS ]
|
||||
|
||||
|
||||
// 7.18.4 Macros for integer constants
|
||||
|
||||
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
|
||||
|
||||
// 7.18.4.1 Macros for minimum-width integer constants
|
||||
|
||||
#define INT8_C(val) val##i8
|
||||
#define INT16_C(val) val##i16
|
||||
#define INT32_C(val) val##i32
|
||||
#define INT64_C(val) val##i64
|
||||
|
||||
#define UINT8_C(val) val##ui8
|
||||
#define UINT16_C(val) val##ui16
|
||||
#define UINT32_C(val) val##ui32
|
||||
#define UINT64_C(val) val##ui64
|
||||
|
||||
// 7.18.4.2 Macros for greatest-width integer constants
|
||||
#define INTMAX_C INT64_C
|
||||
#define UINTMAX_C UINT64_C
|
||||
|
||||
#endif // __STDC_CONSTANT_MACROS ]
|
||||
|
||||
|
||||
#endif // _MSC_STDINT_H_ ]
|
||||
@@ -0,0 +1,16 @@
|
||||
#ifndef _PANDAS_PORTABLE_H_
#define _PANDAS_PORTABLE_H_

// MSVC spells the case-insensitive string compare `_stricmp`.
#if defined(_MSC_VER)
#define strcasecmp( s1, s2 ) _stricmp( s1, s2 )
#endif

// GH-23516 - works around locale perf issues
// from MUSL libc, MIT Licensed - see LICENSES
//
// ASCII-only character helpers.  Each one is a pure expression macro; note
// that the argument `c` may be evaluated more than once.

// True for '0'..'9'.  The unsigned subtraction wraps for anything below '0',
// so a single comparison covers both bounds.
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
// Numeric value of a digit character, or `default` for any other character.
// `default` is parenthesized so that any expression can be passed safely.
#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : (default))
// True for ' ' and for the five characters '\t' .. '\t' + 4.
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
// Map 'a'..'z' to upper case (mask with 0x5f); other values pass through.
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
// Map 'A'..'Z' to lower case (set bit 0x20); other values pass through.
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))

#endif
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef _PANDAS_STDINT_H_
|
||||
#define _PANDAS_STDINT_H_
|
||||
|
||||
#if defined(_MSC_VER) && (_MSC_VER < 1900)
|
||||
#include "ms_stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
Copyright (c) 2016, PyData Development Team
|
||||
All rights reserved.
|
||||
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_INLINE_HELPER_H_
|
||||
#define PANDAS__LIBS_SRC_INLINE_HELPER_H_
|
||||
|
||||
#ifndef PANDAS_INLINE
|
||||
#if defined(__clang__)
|
||||
#define PANDAS_INLINE static __inline__ __attribute__ ((__unused__))
|
||||
#elif defined(__GNUC__)
|
||||
#define PANDAS_INLINE static __inline__
|
||||
#elif defined(_MSC_VER)
|
||||
#define PANDAS_INLINE static __inline
|
||||
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
|
||||
#define PANDAS_INLINE static inline
|
||||
#else
|
||||
#define PANDAS_INLINE
|
||||
#endif // __GNUC__
|
||||
#endif // PANDAS_INLINE
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_INLINE_HELPER_H_
|
||||
@@ -0,0 +1,719 @@
|
||||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_INT(32, char)
|
||||
int main() {
|
||||
int ret, is_missing;
|
||||
khiter_t k;
|
||||
khash_t(32) *h = kh_init(32);
|
||||
k = kh_put(32, h, 5, &ret);
|
||||
if (!ret) kh_del(32, h, k);
|
||||
kh_value(h, k) = 10;
|
||||
k = kh_get(32, h, 10);
|
||||
is_missing = (k == kh_end(h));
|
||||
k = kh_get(32, h, 5);
|
||||
kh_del(32, h, k);
|
||||
for (k = kh_begin(h); k != kh_end(h); ++k)
|
||||
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
||||
kh_destroy(32, h);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2011-09-16 (0.2.6):
|
||||
|
||||
* The capacity is a power of 2. This seems to dramatically improve the
|
||||
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
|
||||
|
||||
- https://github.com/stefanocasazza/ULib
|
||||
- https://nothings.org/computer/judy/
|
||||
|
||||
* Allow to optionally use linear probing which usually has better
|
||||
performance for random input. Double hashing is still the default as it
|
||||
is more robust to certain non-random input.
|
||||
|
||||
* Added Wang's integer hash function (not used by default). This hash
|
||||
function is more robust to certain non-random input.
|
||||
|
||||
2011-02-14 (0.2.5):
|
||||
|
||||
* Allow to declare global functions.
|
||||
|
||||
2009-09-26 (0.2.4):
|
||||
|
||||
* Improve portability
|
||||
|
||||
2008-09-19 (0.2.3):
|
||||
|
||||
* Corrected the example
|
||||
* Improved interfaces
|
||||
|
||||
2008-09-11 (0.2.2):
|
||||
|
||||
* Improved speed a little in kh_put()
|
||||
|
||||
2008-09-10 (0.2.1):
|
||||
|
||||
* Added kh_clear()
|
||||
* Fixed a compiling error
|
||||
|
||||
2008-09-02 (0.2.0):
|
||||
|
||||
* Changed to token concatenation which increases flexibility.
|
||||
|
||||
2008-08-31 (0.1.2):
|
||||
|
||||
* Fixed a bug in kh_get(), which has not been tested previously.
|
||||
|
||||
2008-08-31 (0.1.1):
|
||||
|
||||
* Added destructor
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __AC_KHASH_H
|
||||
#define __AC_KHASH_H
|
||||
|
||||
/*!
|
||||
@header
|
||||
|
||||
Generic hash table library.
|
||||
*/
|
||||
|
||||
#define AC_VERSION_KHASH_H "0.2.6"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "../inline_helper.h"
|
||||
|
||||
|
||||
// hooks for memory allocator, C-runtime allocator used per default
|
||||
#ifndef KHASH_MALLOC
|
||||
#define KHASH_MALLOC malloc
|
||||
#endif
|
||||
|
||||
#ifndef KHASH_REALLOC
|
||||
#define KHASH_REALLOC realloc
|
||||
#endif
|
||||
|
||||
#ifndef KHASH_CALLOC
|
||||
#define KHASH_CALLOC calloc
|
||||
#endif
|
||||
|
||||
#ifndef KHASH_FREE
|
||||
#define KHASH_FREE free
|
||||
#endif
|
||||
|
||||
|
||||
#if UINT_MAX == 0xffffffffu
|
||||
typedef unsigned int khuint32_t;
|
||||
typedef signed int khint32_t;
|
||||
#elif ULONG_MAX == 0xffffffffu
|
||||
typedef unsigned long khuint32_t;
|
||||
typedef signed long khint32_t;
|
||||
#endif
|
||||
|
||||
#if ULONG_MAX == ULLONG_MAX
|
||||
typedef unsigned long khuint64_t;
|
||||
typedef signed long khint64_t;
|
||||
#else
|
||||
typedef unsigned long long khuint64_t;
|
||||
typedef signed long long khint64_t;
|
||||
#endif
|
||||
|
||||
#if UINT_MAX == 0xffffu
|
||||
typedef unsigned int khuint16_t;
|
||||
typedef signed int khint16_t;
|
||||
#elif USHRT_MAX == 0xffffu
|
||||
typedef unsigned short khuint16_t;
|
||||
typedef signed short khint16_t;
|
||||
#endif
|
||||
|
||||
#if UCHAR_MAX == 0xffu
|
||||
typedef unsigned char khuint8_t;
|
||||
typedef signed char khint8_t;
|
||||
#endif
|
||||
|
||||
typedef double khfloat64_t;
|
||||
typedef float khfloat32_t;
|
||||
|
||||
typedef khuint32_t khuint_t;
|
||||
typedef khuint_t khiter_t;
|
||||
|
||||
// Bucket-state flags: one bit per bucket, 32 buckets per flag word.  A set
// bit marks the bucket as empty.  In this variant __ac_isdel() is always 0
// and the isdel setters do nothing.
//
// `flag` and `i` are parenthesized in every expansion so that expression
// arguments such as `i + 1` select the intended word and bit.
#define __ac_isempty(flag, i) ((((flag)[(i)>>5])>>((i)&0x1fU))&1)
#define __ac_isdel(flag, i) (0)
#define __ac_iseither(flag, i) __ac_isempty(flag, i)
#define __ac_set_isdel_false(flag, i) (0)
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>5]&=~(1ul<<((i)&0x1fU)))
#define __ac_set_isempty_true(flag, i) ((flag)[(i)>>5]|=(1ul<<((i)&0x1fU)))
#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
#define __ac_set_isdel_true(flag, i) ((void)0)
|
||||
|
||||
|
||||
// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
|
||||
khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){
|
||||
const khuint32_t SEED = 0xc70f6907UL;
|
||||
// 'm' and 'r' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
const khuint32_t M_32 = 0x5bd1e995;
|
||||
const int R_32 = 24;
|
||||
|
||||
// Initialize the hash to a 'random' value
|
||||
khuint32_t h = SEED ^ 4;
|
||||
|
||||
//handle 4 bytes:
|
||||
k *= M_32;
|
||||
k ^= k >> R_32;
|
||||
k *= M_32;
|
||||
|
||||
h *= M_32;
|
||||
h ^= k;
|
||||
|
||||
// Do a few final mixes of the hash to ensure the "last few
|
||||
// bytes" are well-incorporated. (Really needed here?)
|
||||
h ^= h >> 13;
|
||||
h *= M_32;
|
||||
h ^= h >> 15;
|
||||
return h;
|
||||
}
|
||||
|
||||
// it is possible to have a special x64-version, which would need fewer operations, but
|
||||
// using 32bit version always has also some benefits:
|
||||
// - one code for 32bit and 64bit builds
|
||||
// - the same case for 32bit and 64bit builds
|
||||
// - no performance difference could be measured compared to a possible x64-version
|
||||
|
||||
khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){
|
||||
const khuint32_t SEED = 0xc70f6907UL;
|
||||
// 'm' and 'r' are mixing constants generated offline.
|
||||
// They're not really 'magic', they just happen to work well.
|
||||
const khuint32_t M_32 = 0x5bd1e995;
|
||||
const int R_32 = 24;
|
||||
|
||||
// Initialize the hash to a 'random' value
|
||||
khuint32_t h = SEED ^ 4;
|
||||
|
||||
//handle first 4 bytes:
|
||||
k1 *= M_32;
|
||||
k1 ^= k1 >> R_32;
|
||||
k1 *= M_32;
|
||||
|
||||
h *= M_32;
|
||||
h ^= k1;
|
||||
|
||||
//handle second 4 bytes:
|
||||
k2 *= M_32;
|
||||
k2 ^= k2 >> R_32;
|
||||
k2 *= M_32;
|
||||
|
||||
h *= M_32;
|
||||
h ^= k2;
|
||||
|
||||
// Do a few final mixes of the hash to ensure the "last few
|
||||
// bytes" are well-incorporated.
|
||||
h ^= h >> 13;
|
||||
h *= M_32;
|
||||
h ^= h >> 15;
|
||||
return h;
|
||||
}
|
||||
|
||||
khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){
|
||||
khuint32_t k1 = (khuint32_t)k;
|
||||
khuint32_t k2 = (khuint32_t)(k >> 32);
|
||||
|
||||
return murmur2_32_32to32(k1, k2);
|
||||
}
|
||||
|
||||
|
||||
// Probe increment used when a bucket is already occupied.
//  - KHASH_LINEAR defined: always step by 1 (linear probing).
//  - otherwise: a second hash of `k`, forced odd by `| 1` and limited by the
//    mask `m`.
// The whole expansion is parenthesized so the macro behaves as a single
// operand wherever it is used.
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) ((murmur2_32to32(k) | 1) & (m))
#endif
|
||||
|
||||
#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
static const double __ac_HASH_UPPER = 0.77;
|
||||
|
||||
/*
 * KHASH_DECLARE(name, khkey_t, khval_t)
 *
 * Emits the kh_<name>_t struct typedef together with `extern` declarations
 * (no definitions) of the init / destroy / clear / get / resize / put / del
 * functions for that table type.
 *
 * kh_init_<name> is declared with an explicit `(void)` parameter list so the
 * declaration is a real prototype and agrees with the definition produced by
 * KHASH_INIT2.
 */
#define KHASH_DECLARE(name, khkey_t, khval_t)                               \
    typedef struct {                                                        \
        khuint_t n_buckets, size, n_occupied, upper_bound;                  \
        khuint32_t *flags;                                                  \
        khkey_t *keys;                                                      \
        khval_t *vals;                                                      \
    } kh_##name##_t;                                                        \
    extern kh_##name##_t *kh_init_##name(void);                             \
    extern void kh_destroy_##name(kh_##name##_t *h);                        \
    extern void kh_clear_##name(kh_##name##_t *h);                          \
    extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key);     \
    extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \
    extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
    extern void kh_del_##name(kh_##name##_t *h, khuint_t x);
|
||||
|
||||
/*
 * KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
 *
 * Generate the table type kh_<name>_t and the full set of operations on it.
 *
 *   name          suffix used for the generated type and function names
 *   SCOPE         specifier(s) placed in front of every generated function
 *   khkey_t       key type
 *   khval_t       value type (the vals array is only maintained for maps)
 *   kh_is_map     non-zero for a map, zero for a set
 *   __hash_func   maps a key to a hash value
 *   __hash_equal  returns non-zero when two keys are equal
 *
 * Generated functions:
 *   kh_init_<name>     allocate a zeroed table with no buckets
 *   kh_destroy_<name>  free the arrays and the table (NULL is accepted)
 *   kh_clear_<name>    mark every bucket empty without freeing memory
 *   kh_get_<name>      return the bucket index of key, or n_buckets if absent
 *   kh_resize_<name>   rehash into (at least) the requested number of buckets
 *   kh_put_<name>      insert key; *ret is 0 if already present, 1 if an empty
 *                      bucket was used, 2 if a deleted bucket was reused
 *   kh_del_<name>      mark bucket x as deleted
 */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    typedef struct { \
        khuint_t n_buckets, size, n_occupied, upper_bound; \
        khuint32_t *flags; \
        khkey_t *keys; \
        khval_t *vals; \
    } kh_##name##_t; \
    SCOPE kh_##name##_t *kh_init_##name(void) { \
        return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
    } \
    SCOPE void kh_destroy_##name(kh_##name##_t *h) \
    { \
        if (h) { \
            KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
            KHASH_FREE(h->vals); \
            KHASH_FREE(h); \
        } \
    } \
    SCOPE void kh_clear_##name(kh_##name##_t *h) \
    { \
        if (h && h->flags) { \
            /* Same fill byte kh_resize uses for a fresh flag array, so that */ \
            /* every bucket (one flag bit each, see __ac_fsize) reads as empty. */ \
            memset(h->flags, 0xff, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \
            h->size = h->n_occupied = 0; \
        } \
    } \
    SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
    { \
        if (h->n_buckets) { \
            khuint_t inc, k, i, last, mask; \
            mask = h->n_buckets - 1; \
            k = __hash_func(key); i = k & mask; \
            inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                i = (i + inc) & mask; \
                if (i == last) return h->n_buckets; /* probed the whole cycle: absent */ \
            } \
            return __ac_iseither(h->flags, i)? h->n_buckets : i; \
        } else return 0; /* no buckets: 0 == n_buckets, i.e. "absent" */ \
    } \
    SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \
    { /* Rehashes in place; the only extra allocation is the new flag array. */ \
        khuint32_t *new_flags = 0; \
        khuint_t j = 1; \
        { \
            kroundup32(new_n_buckets); \
            if (new_n_buckets < 4) new_n_buckets = 4; \
            if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
            else { /* hash table size to be changed (shrink or expand); rehash */ \
                new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
                memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
                if (h->n_buckets < new_n_buckets) { /* expand */ \
                    h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
                    if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
                } /* otherwise shrink */ \
            } \
        } \
        if (j) { /* rehashing is needed */ \
            for (j = 0; j != h->n_buckets; ++j) { \
                if (__ac_iseither(h->flags, j) == 0) { \
                    khkey_t key = h->keys[j]; \
                    khval_t val; \
                    khuint_t new_mask; \
                    new_mask = new_n_buckets - 1; \
                    if (kh_is_map) val = h->vals[j]; \
                    __ac_set_isempty_true(h->flags, j); \
                    while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
                        khuint_t inc, k, i; \
                        k = __hash_func(key); \
                        i = k & new_mask; \
                        inc = __ac_inc(k, new_mask); \
                        while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
                        __ac_set_isempty_false(new_flags, i); \
                        if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
                            { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
                            if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
                            __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \
                        } else { /* write the element and jump out of the loop */ \
                            h->keys[i] = key; \
                            if (kh_is_map) h->vals[i] = val; \
                            break; \
                        } \
                    } \
                } \
            } \
            if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
                h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
                if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
            } \
            KHASH_FREE(h->flags); /* free the working space */ \
            h->flags = new_flags; \
            h->n_buckets = new_n_buckets; \
            h->n_occupied = h->size; \
            h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
        } \
    } \
    SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
    { \
        khuint_t x; \
        if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
            if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
            else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
        } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
        { \
            khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
            x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
            if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
            else { \
                inc = __ac_inc(k, mask); last = i; \
                while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                    if (__ac_isdel(h->flags, i)) site = i; /* remember a reusable deleted slot */ \
                    i = (i + inc) & mask; \
                    if (i == last) { x = site; break; } \
                } \
                if (x == h->n_buckets) { \
                    if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
                    else x = i; \
                } \
            } \
        } \
        if (__ac_isempty(h->flags, x)) { /* not present at all */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; ++h->n_occupied; \
            *ret = 1; \
        } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; \
            *ret = 2; \
        } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
        return x; \
    } \
    SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \
    { \
        if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
            __ac_set_isdel_true(h->flags, x); \
            --h->size; \
        } \
    }
|
||||
|
||||
/*
 * Convenience form of KHASH_INIT2 that emits every generated function
 * with PANDAS_INLINE as its scope specifier.
 */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map,               \
                __hash_func, __hash_equal)
|
||||
|
||||
/* --- BEGIN OF HASH FUNCTIONS --- */
|
||||
|
||||
/*! @function
|
||||
@abstract Integer hash function
|
||||
@param key The integer [khuint32_t]
|
||||
@return The hash value [khuint_t]
|
||||
*/
|
||||
#define kh_int_hash_func(key) (khuint32_t)(key)
|
||||
/*! @function
|
||||
@abstract Integer comparison function
|
||||
*/
|
||||
#define kh_int_hash_equal(a, b) ((a) == (b))
|
||||
/*! @function
|
||||
@abstract 64-bit integer hash function
|
||||
@param key The integer [khuint64_t]
|
||||
@return The hash value [khuint_t]
|
||||
*/
|
||||
/* Fold a 64-bit key into a khuint_t by xor-ing it with two shifted copies of itself. */
PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key)
{
    const khuint64_t shifted_down = key >> 33;
    const khuint64_t shifted_up = key << 11;
    return (khuint_t)(shifted_down ^ key ^ shifted_up);
}
|
||||
/*! @function
|
||||
@abstract 64-bit integer comparison function
|
||||
*/
|
||||
#define kh_int64_hash_equal(a, b) ((a) == (b))
|
||||
|
||||
/*! @function
|
||||
@abstract const char* hash function
|
||||
@param s Pointer to a null terminated string
|
||||
@return The hash value
|
||||
*/
|
||||
/*
 * Multiplicative string hash: starting from the first character, each
 * further character c updates the hash as h = h*31 + c, written as
 * (h << 5) - h + c. An empty string hashes to 0.
 */
PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s)
{
    const char *cursor;
    khuint_t hash = *s;

    if (hash == 0) {
        return hash;
    }
    cursor = s + 1;
    while (*cursor) {
        hash = (hash << 5) - hash + *cursor;
        ++cursor;
    }
    return hash;
}
|
||||
/*! @function
|
||||
@abstract Another interface to const char* hash function
|
||||
@param key Pointer to a null terminated string [const char*]
|
||||
@return The hash value [khuint_t]
|
||||
*/
|
||||
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
|
||||
/*! @function
|
||||
@abstract Const char* comparison function
|
||||
*/
|
||||
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
||||
|
||||
/* Integer mixing function: six alternating add/xor rounds with fixed shift amounts. */
PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
{
    khuint_t mixed = key;

    mixed = mixed + ~(mixed << 15);
    mixed = mixed ^ (mixed >> 10);
    mixed = mixed + (mixed << 3);
    mixed = mixed ^ (mixed >> 6);
    mixed = mixed + ~(mixed << 11);
    mixed = mixed ^ (mixed >> 16);
    return mixed;
}
|
||||
/* Hash an integer key with __ac_Wang_hash; uses the macro's own argument rather than a caller-scope `key`. */
#define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)(k))
|
||||
|
||||
/* --- END OF HASH FUNCTIONS --- */
|
||||
|
||||
/* Other convenient macros... */
|
||||
|
||||
/*!
|
||||
@abstract Type of the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define khash_t(name) kh_##name##_t
|
||||
|
||||
/*! @function
|
||||
@abstract Initiate a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@return Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
/* Expands to a call of the generated constructor, which takes no arguments. */
#define kh_init(name) kh_init_##name()
|
||||
|
||||
/*! @function
|
||||
@abstract Destroy a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_destroy(name, h) kh_destroy_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Reset a hash table without deallocating memory.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_clear(name, h) kh_clear_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Resize a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param s New size [khuint_t]
|
||||
*/
|
||||
#define kh_resize(name, h, s) kh_resize_##name(h, s)
|
||||
|
||||
/*! @function
|
||||
@abstract Insert a key to the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@param r Extra return code: 0 if the key is present in the hash table;
|
||||
1 if the bucket is empty (never used); 2 if the element in
|
||||
the bucket has been deleted [int*]
|
||||
@return Iterator to the inserted element [khuint_t]
|
||||
*/
|
||||
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
|
||||
|
||||
/*! @function
|
||||
@abstract Retrieve a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@return Iterator to the found element, or kh_end(h) if the element is absent [khuint_t]
|
||||
*/
|
||||
#define kh_get(name, h, k) kh_get_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Remove a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Iterator to the element to be deleted [khuint_t]
|
||||
*/
|
||||
#define kh_del(name, h, k) kh_del_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Test whether a bucket contains data.
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khuint_t]
|
||||
@return 1 if containing data; 0 otherwise [int]
|
||||
*/
|
||||
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
|
||||
|
||||
/*! @function
|
||||
@abstract Get key given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khuint_t]
|
||||
@return Key [type of keys]
|
||||
*/
|
||||
#define kh_key(h, x) ((h)->keys[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get value given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khuint_t]
|
||||
@return Value [type of values]
|
||||
@discussion For hash sets, calling this results in segfault.
|
||||
*/
|
||||
#define kh_val(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Alias of kh_val()
|
||||
*/
|
||||
#define kh_value(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get the start iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The start iterator [khuint_t]
|
||||
*/
|
||||
#define kh_begin(h) (khuint_t)(0)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the end iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The end iterator [khuint_t]
|
||||
*/
|
||||
#define kh_end(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of elements in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of elements in the hash table [khuint_t]
|
||||
*/
|
||||
#define kh_size(h) ((h)->size)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of buckets in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of buckets in the hash table [khuint_t]
|
||||
*/
|
||||
#define kh_n_buckets(h) ((h)->n_buckets)
|
||||
|
||||
/* More convenient interfaces */
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash set containing integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_INT(name) \
|
||||
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT(name, khval_t) \
|
||||
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_UINT(name, khval_t) \
|
||||
KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash set containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_UINT64(name) \
|
||||
KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
#define KHASH_SET_INIT_INT64(name) \
|
||||
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_UINT64(name, khval_t) \
|
||||
KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_INT64(name, khval_t) \
|
||||
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 16bit-integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT16(name, khval_t) \
|
||||
KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_UINT16(name, khval_t) \
|
||||
KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 8bit-integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT8(name, khval_t) \
|
||||
KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_UINT8(name, khval_t) \
|
||||
KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
|
||||
|
||||
typedef const char *kh_cstr_t;
|
||||
/*! @function
|
||||
@abstract Instantiate a hash set containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_STR(name) \
|
||||
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_STR(name, khval_t) \
|
||||
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
|
||||
#define kh_exist_str(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_float64(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_uint64(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_int64(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_float32(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_int32(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_uint32(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_int16(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_uint16(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_int8(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_uint8(h, k) (kh_exist(h, k))
|
||||
|
||||
KHASH_MAP_INIT_STR(str, size_t)
|
||||
KHASH_MAP_INIT_INT(int32, size_t)
|
||||
KHASH_MAP_INIT_UINT(uint32, size_t)
|
||||
KHASH_MAP_INIT_INT64(int64, size_t)
|
||||
KHASH_MAP_INIT_UINT64(uint64, size_t)
|
||||
KHASH_MAP_INIT_INT16(int16, size_t)
|
||||
KHASH_MAP_INIT_UINT16(uint16, size_t)
|
||||
KHASH_MAP_INIT_INT8(int8, size_t)
|
||||
KHASH_MAP_INIT_UINT8(uint8, size_t)
|
||||
|
||||
|
||||
#endif /* __AC_KHASH_H */
|
||||
@@ -0,0 +1,446 @@
|
||||
#include <string.h>
|
||||
#include <Python.h>
|
||||
|
||||
|
||||
// use numpy's definitions for complex
|
||||
#include <numpy/arrayobject.h>
|
||||
typedef npy_complex64 khcomplex64_t;
|
||||
typedef npy_complex128 khcomplex128_t;
|
||||
|
||||
|
||||
|
||||
// khash should report usage to tracemalloc
|
||||
#if PY_VERSION_HEX >= 0x03060000
|
||||
#include <pymem.h>
|
||||
#if PY_VERSION_HEX < 0x03070000
|
||||
#define PyTraceMalloc_Track _PyTraceMalloc_Track
|
||||
#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
|
||||
#endif
|
||||
#else
|
||||
#define PyTraceMalloc_Track(...)
|
||||
#define PyTraceMalloc_Untrack(...)
|
||||
#endif
|
||||
|
||||
|
||||
static const int KHASH_TRACE_DOMAIN = 424242;
|
||||
void *traced_malloc(size_t size){
|
||||
void * ptr = malloc(size);
|
||||
if(ptr!=NULL){
|
||||
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *traced_calloc(size_t num, size_t size){
|
||||
void * ptr = calloc(num, size);
|
||||
if(ptr!=NULL){
|
||||
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *traced_realloc(void* old_ptr, size_t size){
|
||||
void * ptr = realloc(old_ptr, size);
|
||||
if(ptr!=NULL){
|
||||
if(old_ptr != ptr){
|
||||
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
|
||||
}
|
||||
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void traced_free(void* ptr){
|
||||
if(ptr!=NULL){
|
||||
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
|
||||
}
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
|
||||
#define KHASH_MALLOC traced_malloc
|
||||
#define KHASH_REALLOC traced_realloc
|
||||
#define KHASH_CALLOC traced_calloc
|
||||
#define KHASH_FREE traced_free
|
||||
#include "khash.h"
|
||||
|
||||
// Previously we were using the built in cpython hash function for doubles
|
||||
// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
|
||||
// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
|
||||
|
||||
// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
|
||||
// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3).
|
||||
// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
|
||||
// is 64 bits the truncation causes collision issues. Given all that, we use our own
|
||||
// simple hash, viewing the double bytes as an int64 and using khash's default
|
||||
// hash for 64 bit integers.
|
||||
// GH 13436 showed that _Py_HashDouble doesn't work well with khash
|
||||
// GH 28303 showed, that the simple xoring-version isn't good enough
|
||||
// See GH 36729 for evaluation of the currently used murmur2-hash version
|
||||
// An interesting alternative to expensive murmur2-hash would be to change
|
||||
// the probing strategy and use e.g. the probing strategy from CPython's
|
||||
// implementation of dicts, which shines for smaller sizes but is more
|
||||
// predisposed to superlinear running times (see GH 36729 for comparison)
|
||||
|
||||
|
||||
/* Return the object representation of a double as a 64-bit unsigned integer (copied with memcpy). */
khuint64_t PANDAS_INLINE asuint64(double key) {
    khuint64_t bits;
    memcpy(&bits, &key, sizeof(key));
    return bits;
}
|
||||
|
||||
/* Return the object representation of a float as a 32-bit unsigned integer (copied with memcpy). */
khuint32_t PANDAS_INLINE asuint32(float key) {
    khuint32_t bits;
    memcpy(&bits, &key, sizeof(key));
    return bits;
}
|
||||
|
||||
#define ZERO_HASH 0
|
||||
#define NAN_HASH 0
|
||||
|
||||
/*
 * Hash a double for use as a khash key.
 *
 * 0.0 and -0.0 compare equal and both map to ZERO_HASH; every NaN maps to
 * NAN_HASH. Any other value is hashed from its bit pattern with
 * murmur2_64to32.
 */
khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){
    const int is_zero = (val == 0.0);
    const int is_nan = (val != val);

    if (is_zero) {
        return ZERO_HASH;
    }
    if (is_nan) {
        return NAN_HASH;
    }
    return murmur2_64to32(asuint64(val));
}
|
||||
|
||||
/*
 * Hash a float for use as a khash key.
 *
 * 0.0f and -0.0f compare equal and both map to ZERO_HASH; every NaN maps
 * to NAN_HASH. Any other value is hashed from its bit pattern with
 * murmur2_32to32.
 */
khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){
    const int is_zero = (val == 0.0f);
    const int is_nan = (val != val);

    if (is_zero) {
        return ZERO_HASH;
    }
    if (is_nan) {
        return NAN_HASH;
    }
    return murmur2_32to32(asuint32(val));
}
|
||||
|
||||
#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
|
||||
|
||||
#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
|
||||
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal)
|
||||
|
||||
KHASH_MAP_INIT_FLOAT64(float64, size_t)
|
||||
|
||||
#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
|
||||
KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal)
|
||||
|
||||
KHASH_MAP_INIT_FLOAT32(float32, size_t)
|
||||
|
||||
/* Hash a complex128 by xor-ing the float64 hashes of its real and imaginary parts. */
khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){
    const khuint32_t real_hash = kh_float64_hash_func(val.real);
    const khuint32_t imag_hash = kh_float64_hash_func(val.imag);
    return real_hash ^ imag_hash;
}
|
||||
/* Hash a complex64 by xor-ing the float32 hashes of its real and imaginary parts. */
khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){
    const khuint32_t real_hash = kh_float32_hash_func(val.real);
    const khuint32_t imag_hash = kh_float32_hash_func(val.imag);
    return real_hash ^ imag_hash;
}
|
||||
|
||||
/*
 * Two complex keys are equal when both their real parts and their
 * imaginary parts are equal under kh_floats_hash_equal (which treats NaN
 * as equal to NaN). Arguments are parenthesised before member access so
 * any expression can be passed.
 */
#define kh_complex_hash_equal(a, b) \
    (kh_floats_hash_equal((a).real, (b).real) && kh_floats_hash_equal((a).imag, (b).imag))
|
||||
|
||||
|
||||
#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
|
||||
KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal)
|
||||
|
||||
KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
|
||||
|
||||
|
||||
#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
|
||||
KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal)
|
||||
|
||||
KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
|
||||
|
||||
|
||||
#define kh_exist_complex64(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_complex128(h, k) (kh_exist(h, k))
|
||||
|
||||
|
||||
// NaN-floats should be in the same equivalency class, see GH 22119
|
||||
/* Return 1 if two Python floats are equal or are both NaN, otherwise 0. */
int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
    const double lhs = PyFloat_AS_DOUBLE(a);
    const double rhs = PyFloat_AS_DOUBLE(b);

    if (Py_IS_NAN(lhs) && Py_IS_NAN(rhs)) {
        return 1;
    }
    return lhs == rhs;
}
|
||||
|
||||
|
||||
// NaNs should be in the same equivalency class, see GH 41836
|
||||
// PyObject_RichCompareBool for complexobjects has a different behavior
|
||||
// needs to be replaced
|
||||
/*
 * Return 1 if two Python complex numbers match component-wise, where a
 * component matches when the two values are equal or are both NaN;
 * otherwise return 0.
 */
int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
    const int real_match =
        (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real)) ||
        (a->cval.real == b->cval.real);
    const int imag_match =
        (Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) ||
        (a->cval.imag == b->cval.imag);

    return real_match && imag_match;
}
|
||||
|
||||
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);
|
||||
|
||||
|
||||
// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
|
||||
// which treats NaNs as equivalent
|
||||
// see GH 41836
|
||||
/*
 * Return 1 if two tuples have the same length and every pair of items at
 * the same position compares equal under pyobject_cmp; otherwise 0.
 */
int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
    Py_ssize_t pos = 0;

    if (Py_SIZE(a) != Py_SIZE(b)) {
        return 0;
    }

    while (pos < Py_SIZE(a)) {
        if (!pyobject_cmp(PyTuple_GET_ITEM(a, pos), PyTuple_GET_ITEM(b, pos))) {
            return 0;
        }
        ++pos;
    }
    return 1;
}
|
||||
|
||||
|
||||
/*
 * Equality test used for Python-object keys.
 *
 * Identical objects are equal. Exact floats, complex numbers and tuples
 * of the same type are compared with the NaN-aware helpers, because
 * PyObject_RichCompareBool would report NaN != NaN. Everything else goes
 * through PyObject_RichCompareBool; if that raises, the error is cleared
 * and the objects are treated as not equal.
 */
int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
    int verdict;

    if (a == b) {
        return 1;
    }
    if (Py_TYPE(a) == Py_TYPE(b)) {
        if (PyFloat_CheckExact(a)) {
            return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
        } else if (PyComplex_CheckExact(a)) {
            return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
        } else if (PyTuple_CheckExact(a)) {
            return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
        }
        // frozenset isn't yet supported
    }

    verdict = PyObject_RichCompareBool(a, b, Py_EQ);
    if (verdict >= 0) {
        return verdict;
    }
    PyErr_Clear();
    return 0;
}
|
||||
|
||||
|
||||
/*
 * Hash a double like CPython does, except that every NaN hashes to 0.
 * (Since Python 3.10, NaN no longer has hash 0 in CPython itself.)
 */
Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
    if (Py_IS_NAN(val)) {
        return 0;
    }
#if PY_VERSION_HEX >= 0x030A0000
    // from 3.10 on, _Py_HashDouble takes an extra leading object argument
    return _Py_HashDouble(NULL, val);
#else
    return _Py_HashDouble(val);
#endif
}
|
||||
|
||||
|
||||
/* Hash a Python float through _Pandas_HashDouble. */
Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
    const double value = PyFloat_AS_DOUBLE(key);
    return _Pandas_HashDouble(value);
}
|
||||
|
||||
|
||||
#define _PandasHASH_IMAG 1000003UL
|
||||
|
||||
// replaces _Py_HashDouble with _Pandas_HashDouble
|
||||
/*
 * Hash a Python complex number as real_hash + _PandasHASH_IMAG * imag_hash,
 * with both component hashes taken from _Pandas_HashDouble.
 *
 * Returns -1 if either component hash is -1, and -2 in place of a combined
 * value of -1.
 */
Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
    const Py_uhash_t failure = (Py_uhash_t)-1;
    Py_uhash_t real_part = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
    Py_uhash_t imag_part = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
    Py_uhash_t mixed;

    if (real_part == failure || imag_part == failure) {
        return -1;
    }
    mixed = real_part + _PandasHASH_IMAG * imag_part;
    return (mixed == failure) ? -2 : (Py_hash_t)mixed;
}
|
||||
|
||||
|
||||
khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
|
||||
|
||||
//we could use any hashing algorithm, this is the original CPython's for tuples
|
||||
|
||||
#if SIZEOF_PY_UHASH_T > 4
|
||||
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
|
||||
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
|
||||
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
|
||||
#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
|
||||
#else
|
||||
#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
|
||||
#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
|
||||
#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
|
||||
#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
|
||||
#endif
|
||||
|
||||
/*
 * Hash a tuple by folding the kh_python_hash_func value of each item into
 * an accumulator (multiply, rotate, multiply with the _PandasHASH_XX*
 * constants), then mixing in the length.
 *
 * Returns -1 if an item's hash equals (Py_uhash_t)-1; a final accumulator
 * of -1 is replaced by 1546275796.
 */
Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
    Py_ssize_t len = Py_SIZE(key);
    PyObject **items = key->ob_item;
    Py_uhash_t acc = _PandasHASH_XXPRIME_5;
    Py_ssize_t pos = 0;

    while (pos < len) {
        Py_uhash_t lane = kh_python_hash_func(items[pos]);
        if (lane == (Py_uhash_t)-1) {
            return -1;
        }
        acc += lane * _PandasHASH_XXPRIME_2;
        acc = _PandasHASH_XXROTATE(acc);
        acc *= _PandasHASH_XXPRIME_1;
        pos++;
    }

    /* Add input length, mangled to keep the historical value of hash(()). */
    acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);

    if (acc == (Py_uhash_t)-1) {
        return 1546275796;
    }
    return acc;
}
|
||||
|
||||
|
||||
/*
 * Hash an arbitrary Python object to 32 bits for khash.
 *
 * Exact floats, complex numbers and tuples use the pandas-specific hash
 * helpers; every other object uses PyObject_Hash. A hash of -1 is treated
 * as an error: the Python error is cleared and 0 is returned. When
 * Py_hash_t is wider than 32 bits the upper half is xor-ed into the lower
 * half (see GH 37615).
 */
khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
    Py_hash_t raw;

    // PyObject_Hash gives hash(0.0) == 0 == hash(-0.0), but different
    // nan-objects may get different hash values, hence the special cases.
    if (PyFloat_CheckExact(key)) {
        // kh_float64_hash_func cannot be used here: float(k) == k holds
        // for any int-object k and kh_float64_hash_func doesn't respect it
        raw = floatobject_hash((PyFloatObject*)key);
    } else if (PyComplex_CheckExact(key)) {
        // kh_complex128_hash_func cannot be used here: complex(k,0) == k
        // holds for any int-object k and it doesn't respect that
        raw = complexobject_hash((PyComplexObject*)key);
    } else if (PyTuple_CheckExact(key)) {
        raw = tupleobject_hash((PyTupleObject*)key);
    } else {
        raw = PyObject_Hash(key);
    }

    if (raw == -1) {
        PyErr_Clear();
        return 0;
    }
#if SIZEOF_PY_HASH_T == 4
    // it is already 32bit value
    return raw;
#else
    // for 64bit builds the upper 32 bits must contribute as well;
    // the unsigned type avoids undefined behavior of signed ints
    {
        const khuint64_t wide = (khuint64_t) raw;
        return (wide >> 32) ^ wide;
    }
#endif
}
|
||||
|
||||
|
||||
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
|
||||
|
||||
|
||||
// Python object
|
||||
|
||||
typedef PyObject* kh_pyobject_t;
|
||||
|
||||
#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
|
||||
KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
|
||||
kh_python_hash_func, kh_python_hash_equal)
|
||||
|
||||
KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
|
||||
|
||||
#define KHASH_SET_INIT_PYOBJECT(name) \
|
||||
KHASH_INIT(name, kh_pyobject_t, char, 0, \
|
||||
kh_python_hash_func, kh_python_hash_equal)
|
||||
|
||||
KHASH_SET_INIT_PYOBJECT(pyset)
|
||||
|
||||
#define kh_exist_pymap(h, k) (kh_exist(h, k))
|
||||
#define kh_exist_pyset(h, k) (kh_exist(h, k))
|
||||
|
||||
KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
|
||||
|
||||
typedef struct {
|
||||
kh_str_t *table;
|
||||
int starts[256];
|
||||
} kh_str_starts_t;
|
||||
|
||||
typedef kh_str_starts_t* p_kh_str_starts_t;
|
||||
|
||||
/*
 * Allocate a kh_str_starts_t with a zeroed `starts` array and a fresh,
 * empty string table.
 *
 * Returns NULL if either allocation fails; nothing is leaked in that case.
 */
p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) {
    kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
    if (result == NULL) {
        return NULL;
    }
    result->table = kh_init_str();
    if (result->table == NULL) {
        // do not hand out a half-built object
        KHASH_FREE(result);
        return NULL;
    }
    return result;
}
|
||||
|
||||
/*
 * Insert key into the underlying string table and, when kh_put_str
 * reports a non-zero *ret, flag the key's first byte in `starts`.
 * Returns the bucket index given by kh_put_str.
 */
khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) {
    const khuint_t slot = kh_put_str(table->table, key, ret);
    if (*ret == 0) {
        return slot;
    }
    table->starts[(unsigned char)key[0]] = 1;
    return slot;
}
|
||||
|
||||
/*
 * Membership test: returns 1 if key is in the table, else 0.
 *
 * The `starts` array is consulted first so that keys whose first byte was
 * never inserted are rejected without a hash lookup. An empty key is
 * reported present as soon as its first byte ('\0') is flagged.
 */
khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) {
    const unsigned char first = *key;

    if (!table->starts[first]) {
        return 0;
    }
    if (first == '\0') {
        return 1;
    }
    return kh_get_str(table->table, key) != table->table->n_buckets;
}
|
||||
|
||||
/*
 * Free the underlying string table and the wrapper itself.
 * A NULL table is accepted and ignored, matching kh_destroy_<name>.
 */
void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
    if (table == NULL) {
        return;
    }
    kh_destroy_str(table->table);
    KHASH_FREE(table);
}
|
||||
|
||||
/* Resize the underlying string table to hold (at least) val buckets. */
void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
    kh_str_t *inner = table->table;
    kh_resize_str(inner, val);
}
|
||||
|
||||
// utility function: given the number of elements
|
||||
// returns number of necessary buckets
|
||||
khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
|
||||
khuint_t candidate = n_elements;
|
||||
kroundup32(candidate);
|
||||
khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
|
||||
return (upper_bound < n_elements) ? 2*candidate : candidate;
|
||||
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
Copyright (c) 2016, PyData Development Team
|
||||
All rights reserved.
|
||||
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_
|
||||
#define PANDAS__LIBS_SRC_PARSE_HELPER_H_
|
||||
|
||||
#include <float.h>
|
||||
#include "parser/tokenizer.h"
|
||||
|
||||
int to_double(char *item, double *p_value, char sci, char decimal,
|
||||
int *maybe_int) {
|
||||
char *p_end = NULL;
|
||||
int error = 0;
|
||||
|
||||
/* Switch to precise xstrtod GH 31364 */
|
||||
*p_value = precise_xstrtod(item, &p_end, decimal, sci, '\0', 1,
|
||||
&error, maybe_int);
|
||||
|
||||
return (error == 0) && (!*p_end);
|
||||
}
|
||||
|
||||
/*
 * Convert a Python bytes or str object to a double.
 *
 * The text is first handed to to_double(); if that fails, the spellings
 * "inf" and "infinity" (case-insensitive, optionally prefixed by one '+' or
 * '-') are accepted and mapped to +/-HUGE_VAL with *maybe_int cleared.
 *
 * Returns 0 on success.  Returns -1 with a Python exception set when the
 * object is neither bytes nor str (TypeError), when UTF-8 encoding fails, or
 * when the text cannot be parsed (ValueError).
 */
int floatify(PyObject *str, double *result, int *maybe_int) {
    const char sci = 'E';
    const char dec = '.';
    PyObject *encoded = NULL; /* owned UTF-8 copy when `str` is unicode */
    const char *tail;
    char *data;
    double inf = HUGE_VAL;

    if (PyBytes_Check(str)) {
        data = PyBytes_AS_STRING(str);
    } else if (PyUnicode_Check(str)) {
        encoded = PyUnicode_AsUTF8String(str);
        if (encoded == NULL) {
            return -1;
        }
        data = PyBytes_AS_STRING(encoded);
    } else {
        PyErr_SetString(PyExc_TypeError, "Invalid object type");
        return -1;
    }

    if (!to_double(data, result, sci, dec, maybe_int)) {
        /* handle inf/-inf infinity/-infinity */
        tail = data;
        if (*tail == '-') {
            inf = -HUGE_VAL;
            ++tail;
        } else if (*tail == '+') {
            ++tail;
        }

        if (strcasecmp(tail, "inf") != 0 &&
            strcasecmp(tail, "infinity") != 0) {
            PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"",
                         data);
            Py_XDECREF(encoded);
            return -1;
        }

        *result = inf;
        *maybe_int = 0;
    }

    Py_XDECREF(encoded);
    return 0;
}
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_
|
||||
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
Copyright (c) 2016, PyData Development Team
|
||||
All rights reserved.
|
||||
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
#include "io.h"
|
||||
|
||||
/*
|
||||
On-disk FILE, uncompressed
|
||||
*/
|
||||
|
||||
/*
 * Allocate an rd_source wrapping `obj`.  A new reference to `obj` is taken;
 * the buffer starts out empty and the position at 0.
 *
 * Returns NULL with MemoryError set if the allocation fails.
 */
void *new_rd_source(PyObject *obj) {
    rd_source *src = (rd_source *)malloc(sizeof(rd_source));

    if (!src) {
        PyErr_NoMemory();
        return NULL;
    }

    /* hold on to this object */
    Py_INCREF(obj);
    src->obj = obj;
    src->position = 0;
    src->buffer = NULL;

    return (void *)src;
}
|
||||
|
||||
/*
|
||||
|
||||
Cleanup callbacks
|
||||
|
||||
*/
|
||||
|
||||
int del_rd_source(void *rds) {
|
||||
Py_XDECREF(RDS(rds)->obj);
|
||||
Py_XDECREF(RDS(rds)->buffer);
|
||||
free(rds);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
IO callbacks
|
||||
|
||||
*/
|
||||
|
||||
/*
 * IO callback: call `read(nbytes)` on the wrapped Python object and expose
 * the returned bytes.
 *
 * A non-bytes result is encoded to UTF-8 using `encoding_errors`.  The
 * resulting bytes object is kept alive in src->buffer until the next call
 * (or del_rd_source), so the returned pointer stays valid until then.
 *
 * On success *bytes_read is the chunk length and *status is 0, or REACHED_EOF
 * for an empty chunk.  On any failure (attribute lookup, the call itself, or
 * encoding) NULL is returned with *bytes_read = 0 and
 * *status = CALLING_READ_FAILED; the Python exception is left set.
 *
 * The GIL is acquired for the duration of the Python calls.
 */
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
                      int *status, const char *encoding_errors) {
    PyGILState_STATE state;
    PyObject *result, *func, *args, *tmp;

    void *retval;

    Py_ssize_t length;
    rd_source *src = RDS(source);
    state = PyGILState_Ensure();

    /* delete old object */
    Py_XDECREF(src->buffer);
    src->buffer = NULL;

    /* "n" matches Py_ssize_t; the previous "i" read a size_t as a C int */
    args = Py_BuildValue("(n)", (Py_ssize_t)nbytes);
    func = PyObject_GetAttrString(src->obj, "read");

    /* TODO: does this release the GIL? */
    if (args == NULL || func == NULL) {
        /* never call through a NULL callable or argument tuple */
        result = NULL;
    } else {
        result = PyObject_CallObject(func, args);
    }
    Py_XDECREF(args);
    Py_XDECREF(func);

    if (result != NULL && !PyBytes_Check(result)) {
        tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
        Py_DECREF(result);
        result = tmp; /* NULL when encoding failed */
    }

    if (result == NULL) {
        PyGILState_Release(state);
        *bytes_read = 0;
        *status = CALLING_READ_FAILED;
        return NULL;
    }

    /* result is guaranteed to be a bytes object from here on */
    length = PyBytes_GET_SIZE(result);

    if (length == 0)
        *status = REACHED_EOF;
    else
        *status = 0;

    /* hang on to the Python object */
    src->buffer = result;
    retval = (void *)PyBytes_AsString(result);

    PyGILState_Release(state);

    /* TODO: more error handling */
    *bytes_read = (size_t)length;

    return retval;
}
|
||||
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
Copyright (c) 2016, PyData Development Team
|
||||
All rights reserved.
|
||||
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_
|
||||
#define PANDAS__LIBS_SRC_PARSER_IO_H_
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include "tokenizer.h"
|
||||
|
||||
#define FS(source) ((file_source *)source)
|
||||
|
||||
typedef struct _rd_source {
|
||||
PyObject *obj;
|
||||
PyObject *buffer;
|
||||
size_t position;
|
||||
} rd_source;
|
||||
|
||||
#define RDS(source) ((rd_source *)source)
|
||||
|
||||
void *new_rd_source(PyObject *obj);
|
||||
|
||||
int del_rd_source(void *src);
|
||||
|
||||
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
|
||||
int *status, const char *encoding_errors);
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_PARSER_IO_H_
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,236 @@
|
||||
/*
|
||||
|
||||
Copyright (c) 2012, Lambda Foundry, Inc., except where noted
|
||||
|
||||
Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
|
||||
BSD
|
||||
|
||||
See LICENSE for the license
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
|
||||
#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
|
||||
#define ERROR_NO_DIGITS 1
|
||||
#define ERROR_OVERFLOW 2
|
||||
#define ERROR_INVALID_CHARS 3
|
||||
|
||||
#include "../headers/stdint.h"
|
||||
#include "../inline_helper.h"
|
||||
#include "../headers/portable.h"
|
||||
|
||||
#include "khash.h"
|
||||
|
||||
#define STREAM_INIT_SIZE 32
|
||||
|
||||
#define REACHED_EOF 1
|
||||
#define CALLING_READ_FAILED 2
|
||||
|
||||
|
||||
/*
|
||||
|
||||
C flat file parsing low level code for pandas / NumPy
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
* Common set of error types for the read_rows() and tokenize()
|
||||
* functions.
|
||||
*/
|
||||
|
||||
// #define VERBOSE
|
||||
#if defined(VERBOSE)
|
||||
#define TRACE(X) printf X;
|
||||
#else
|
||||
#define TRACE(X)
|
||||
#endif // VERBOSE
|
||||
|
||||
#define PARSER_OUT_OF_MEMORY -1
|
||||
|
||||
/*
|
||||
* TODO: Might want to couple count_rows() with read_rows() to avoid
|
||||
* duplication of some file I/O.
|
||||
*/
|
||||
|
||||
// States of the tokenizer state machine.  Enumerators are numbered
// consecutively from 0 in declaration order.
typedef enum {
    START_RECORD = 0,
    START_FIELD,
    ESCAPED_CHAR,
    IN_FIELD,
    IN_QUOTED_FIELD,
    ESCAPE_IN_QUOTED_FIELD,
    QUOTE_IN_QUOTED_FIELD,
    EAT_CRNL,
    EAT_CRNL_NOP,
    EAT_WHITESPACE,
    EAT_COMMENT,
    EAT_LINE_COMMENT,
    WHITESPACE_LINE,
    // counterparts of the field states used while a line is being skipped
    START_FIELD_IN_SKIP_LINE,
    IN_FIELD_IN_SKIP_LINE,
    IN_QUOTED_FIELD_IN_SKIP_LINE,
    QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
    FINISHED
} ParserState;
|
||||
|
||||
// Quoting styles.  The ordering (0..3) matches the numeric values of
// Python's csv.QUOTE_MINIMAL / QUOTE_ALL / QUOTE_NONNUMERIC / QUOTE_NONE.
typedef enum {
    QUOTE_MINIMAL = 0,
    QUOTE_ALL,
    QUOTE_NONNUMERIC,
    QUOTE_NONE
} QuoteStyle;
|
||||
|
||||
// Policy stored in parser_t.on_bad_lines.
typedef enum {
    ERROR = 0,
    WARN,
    SKIP
} BadLineHandleMethod;
|
||||
|
||||
// Reader callback: asked for `nbytes` bytes from `src`; returns a pointer to
// the data and reports the byte count through *bytes_read and a status code
// through *status.
typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                             int *status, const char *encoding_errors);

// Cleanup callback for the same opaque `src` pointer.
typedef int (*io_cleanup)(void *src);
|
||||
|
||||
typedef struct parser_t {
|
||||
void *source;
|
||||
io_callback cb_io;
|
||||
io_cleanup cb_cleanup;
|
||||
|
||||
int64_t chunksize; // Number of bytes to prepare for each chunk
|
||||
char *data; // pointer to data to be processed
|
||||
int64_t datalen; // amount of data available
|
||||
int64_t datapos;
|
||||
|
||||
// where to write out tokenized data
|
||||
char *stream;
|
||||
uint64_t stream_len;
|
||||
uint64_t stream_cap;
|
||||
|
||||
// Store words in (potentially ragged) matrix for now, hmm
|
||||
char **words;
|
||||
int64_t *word_starts; // where we are in the stream
|
||||
uint64_t words_len;
|
||||
uint64_t words_cap;
|
||||
uint64_t max_words_cap; // maximum word cap encountered
|
||||
|
||||
char *pword_start; // pointer to stream start of current field
|
||||
int64_t word_start; // position start of current field
|
||||
|
||||
int64_t *line_start; // position in words for start of line
|
||||
int64_t *line_fields; // Number of fields in each line
|
||||
uint64_t lines; // Number of (good) lines observed
|
||||
uint64_t file_lines; // Number of lines (including bad or skipped)
|
||||
uint64_t lines_cap; // Vector capacity
|
||||
|
||||
// Tokenizing stuff
|
||||
ParserState state;
|
||||
int doublequote; /* is " represented by ""? */
|
||||
char delimiter; /* field separator */
|
||||
int delim_whitespace; /* delimit by consuming space/tabs instead */
|
||||
char quotechar; /* quote character */
|
||||
char escapechar; /* escape character */
|
||||
char lineterminator;
|
||||
int skipinitialspace; /* ignore spaces following delimiter? */
|
||||
int quoting; /* style of quoting to write */
|
||||
|
||||
char commentchar;
|
||||
int allow_embedded_newline;
|
||||
|
||||
int usecols; // Boolean: 1: usecols provided, 0: none provided
|
||||
|
||||
Py_ssize_t expected_fields;
|
||||
BadLineHandleMethod on_bad_lines;
|
||||
|
||||
// floating point options
|
||||
char decimal;
|
||||
char sci;
|
||||
|
||||
// thousands separator (comma, period)
|
||||
char thousands;
|
||||
|
||||
int header; // Boolean: 1: has header, 0: no header
|
||||
int64_t header_start; // header row start
|
||||
uint64_t header_end; // header row end
|
||||
|
||||
void *skipset;
|
||||
PyObject *skipfunc;
|
||||
int64_t skip_first_N_rows;
|
||||
int64_t skip_footer;
|
||||
double (*double_converter)(const char *, char **,
|
||||
char, char, char, int, int *, int *);
|
||||
|
||||
// error handling
|
||||
char *warn_msg;
|
||||
char *error_msg;
|
||||
|
||||
int skip_empty_lines;
|
||||
} parser_t;
|
||||
|
||||
// Cursor for walking one column across successive lines (see COLITER_NEXT).
typedef struct coliter_t {
    char **words;         // word table to index into
    int64_t *line_start;  // entry for the current line; advanced on each step
    int64_t col;          // column offset added to the line's start index
} coliter_t;
|
||||
|
||||
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start);
|
||||
|
||||
// Fetch the iterator's column for the current line into `word` and advance
// the iterator to the next line.  If the column index reaches the start of
// the following line (i.e. the line is too short), `word` becomes "".
// Both macro arguments are parenthesized so any lvalue expression can be
// passed safely.
#define COLITER_NEXT(iter, word)                                   \
    do {                                                           \
        const int64_t i = *(iter).line_start++ + (iter).col;       \
        (word) = i >= *(iter).line_start ? "" : (iter).words[i];   \
    } while (0)
|
||||
|
||||
parser_t *parser_new(void);
|
||||
|
||||
int parser_init(parser_t *self);
|
||||
|
||||
int parser_consume_rows(parser_t *self, size_t nrows);
|
||||
|
||||
int parser_trim_buffers(parser_t *self);
|
||||
|
||||
int parser_add_skiprow(parser_t *self, int64_t row);
|
||||
|
||||
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
|
||||
|
||||
void parser_free(parser_t *self);
|
||||
|
||||
void parser_del(parser_t *self);
|
||||
|
||||
void parser_set_default_options(parser_t *self);
|
||||
|
||||
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
|
||||
|
||||
int tokenize_all_rows(parser_t *self, const char *encoding_errors);
|
||||
|
||||
// Have parsed / type-converted a chunk of data
|
||||
// and want to free memory from the token stream
|
||||
|
||||
// Flags passed to str_to_uint64 / uint64_conflict.
// NOTE(review): judging by the names these record whether signed values,
// unsigned values and nulls were seen -- confirm against tokenizer.c.
typedef struct uint_state {
    int seen_sint;
    int seen_uint;
    int seen_null;
} uint_state;
|
||||
|
||||
void uint_state_init(uint_state *self);
|
||||
|
||||
int uint64_conflict(uint_state *self);
|
||||
|
||||
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
|
||||
uint64_t uint_max, int *error, char tsep);
|
||||
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
|
||||
int *error, char tsep);
|
||||
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
|
||||
int skip_trailing, int *error, int *maybe_int);
|
||||
double precise_xstrtod(const char *p, char **q, char decimal,
|
||||
char sci, char tsep, int skip_trailing,
|
||||
int *error, int *maybe_int);
|
||||
|
||||
// GH-15140 - round_trip requires and acquires the GIL on its own
|
||||
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
|
||||
int skip_trailing, int *error, int *maybe_int);
|
||||
int to_boolean(const char *item, uint8_t *val);
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
|
||||
@@ -0,0 +1,300 @@
|
||||
/*
|
||||
Copyright (c) 2016, PyData Development Team
|
||||
All rights reserved.
|
||||
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
|
||||
Flexibly-sized, index-able skiplist data structure for maintaining a sorted
|
||||
list of values
|
||||
|
||||
Port of Wes McKinney's Cython version of Raymond Hettinger's original pure
|
||||
Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/)
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_
|
||||
#define PANDAS__LIBS_SRC_SKIPLIST_H_
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "inline_helper.h"
|
||||
|
||||
// Build a float NaN by reinterpreting the bit pattern 0x7fc00000 through a
// union (assumes a 32-bit int and IEEE-754 single-precision float).
PANDAS_INLINE float __skiplist_nanf(void) {
    const union {
        int as_int;
        float as_float;
    } pattern = {0x7fc00000UL};

    return pattern.as_float;
}

// NaN as a double; used as the value stored in the skiplist head node.
#define PANDAS_NAN ((double)__skiplist_nanf())
|
||||
|
||||
// Base-2 logarithm computed as log(val) / log(2).
PANDAS_INLINE double Log2(double val) {
    const double numerator = log(val);
    return numerator / log(2.);
}
|
||||
|
||||
typedef struct node_t node_t;

// One skiplist node.  `next` and `width` are parallel arrays of `levels`
// entries each.
struct node_t {
    node_t **next;  // successor at each level
    int *width;     // number of positions spanned by the link at each level
    double value;   // stored value
    int is_nil;     // 1 only for the terminating NIL sentinel
    int levels;     // length of `next` / `width`
    int ref_count;  // manual reference count (see node_incref / node_destroy)
};
|
||||
|
||||
typedef struct {
|
||||
node_t *head;
|
||||
node_t **tmp_chain;
|
||||
int *tmp_steps;
|
||||
int size;
|
||||
int maxlevels;
|
||||
} skiplist_t;
|
||||
|
||||
// Pseudo-random double strictly between 0 and 1, derived from rand().
PANDAS_INLINE double urand(void) {
    const double numerator = (double)rand() + 1;
    const double denominator = (double)RAND_MAX + 2;

    return numerator / denominator;
}
|
||||
|
||||
// Smaller of two ints (returns b when they are equal).
PANDAS_INLINE int int_min(int a, int b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
||||
|
||||
// Allocate a node holding `value` with `levels` link slots.
// The node starts with ref_count 0 and is_nil 0; the contents of next[] and
// width[] are left uninitialised.  Returns NULL if any allocation fails.
PANDAS_INLINE node_t *node_init(double value, int levels) {
    node_t *node = (node_t *)malloc(sizeof(node_t));

    if (!node) {
        return NULL;
    }

    node->value = value;
    node->levels = levels;
    node->is_nil = 0;
    node->ref_count = 0;
    node->next = (node_t **)malloc(levels * sizeof(node_t *));
    node->width = (int *)malloc(levels * sizeof(int));

    // With levels == 0 a NULL from malloc(0) is not treated as a failure.
    if ((levels != 0) && (!node->next || !node->width)) {
        free(node->next);
        free(node->width);
        free(node);
        return NULL;
    }
    return node;
}
|
||||
|
||||
// do this ourselves
|
||||
PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); }
|
||||
|
||||
// Drop one reference to `node` (does not free it; see node_destroy).
PANDAS_INLINE void node_decref(node_t *node) {
    node->ref_count -= 1;
}
|
||||
|
||||
// Release one reference to `node`.  While other references remain
// (ref_count > 1) only the count is decremented; otherwise every successor
// in next[] is released recursively and the node itself is freed.
// A NULL node is ignored.
static void node_destroy(node_t *node) {
    int level;

    if (!node) {
        return;
    }
    if (node->ref_count > 1) {
        node_decref(node);
        return;
    }

    for (level = 0; level < node->levels; ++level) {
        node_destroy(node->next[level]);
    }
    free(node->next);
    free(node->width);
    free(node);
}
|
||||
|
||||
// Free a skiplist: its node chain (via the head), both scratch arrays and
// the container itself.  A NULL skiplist is ignored.
PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) {
    if (!skp) {
        return;
    }
    node_destroy(skp->head);
    free(skp->tmp_steps);
    free(skp->tmp_chain);
    free(skp);
}
|
||||
|
||||
// Create an empty skiplist sized for roughly `expected_size` values.
// The head is 1 + floor(log2(expected_size)) levels tall; values below 1 are
// treated as 1 so the logarithm is never taken of a non-positive number.
// Every head link initially points at a shared NIL sentinel with width 1.
// Returns NULL if any allocation fails.
PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) {
    skiplist_t *result;
    node_t *NIL, *head;
    int maxlevels, i;

    if (expected_size < 1) {
        maxlevels = 1;
    } else {
        maxlevels = 1 + Log2((double)expected_size);
    }

    result = (skiplist_t *)malloc(sizeof(skiplist_t));
    if (!result) {
        return NULL;
    }
    result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
    result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
    result->maxlevels = maxlevels;
    result->size = 0;

    head = result->head = node_init(PANDAS_NAN, maxlevels);
    NIL = node_init(0.0, 0);

    if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
        // head->next[] has not been filled in yet, so node_destroy() -- which
        // walks next[] -- must not be used here; free the pieces directly.
        if (head) {
            free(head->next);
            free(head->width);
            free(head);
        }
        if (NIL) {
            free(NIL->next);
            free(NIL->width);
            free(NIL);
        }
        free(result->tmp_chain);
        free(result->tmp_steps);
        free(result);
        return NULL;
    }

    node_incref(head);

    NIL->is_nil = 1;

    for (i = 0; i < maxlevels; ++i) {
        head->next[i] = NIL;
        head->width[i] = 1;
        node_incref(NIL);  // one reference per head link
    }

    return result;
}
|
||||
|
||||
// 1 if left < right, 0 if left == right, -1 if left > right
|
||||
PANDAS_INLINE int _node_cmp(node_t *node, double value) {
|
||||
if (node->is_nil || node->value > value) {
|
||||
return -1;
|
||||
} else if (node->value < value) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the value at 0-based sorted position `i`.
// *ret is set to 1 on success; for an out-of-range index *ret is set to 0
// and 0 is returned.
PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
    node_t *cur;
    int lvl, remaining;

    if (i < 0 || i >= skp->size) {
        *ret = 0;
        return 0;
    }

    // Positions are counted from the head, so the target is i + 1 steps away.
    remaining = i + 1;
    cur = skp->head;
    for (lvl = skp->maxlevels - 1; lvl >= 0; --lvl) {
        for (; cur->width[lvl] <= remaining; cur = cur->next[lvl]) {
            remaining -= cur->width[lvl];
        }
    }

    *ret = 1;
    return cur->value;
}
|
||||
|
||||
// Returns the lowest rank of all elements with value `value`, as opposed to the
|
||||
// highest rank returned by `skiplist_insert`.
|
||||
PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
|
||||
node_t *node;
|
||||
int level, rank = 0;
|
||||
|
||||
node = skp->head;
|
||||
for (level = skp->maxlevels - 1; level >= 0; --level) {
|
||||
while (_node_cmp(node->next[level], value) > 0) {
|
||||
rank += node->width[level];
|
||||
node = node->next[level];
|
||||
}
|
||||
}
|
||||
|
||||
return rank + 1;
|
||||
}
|
||||
|
||||
// Returns the rank of the inserted element. When there are duplicates,
|
||||
// `rank` is the highest of the group, i.e. the 'max' method of
|
||||
// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
|
||||
PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
|
||||
node_t *node, *prevnode, *newnode, *next_at_level;
|
||||
int *steps_at_level;
|
||||
int size, steps, level, rank = 0;
|
||||
node_t **chain;
|
||||
|
||||
chain = skp->tmp_chain;
|
||||
|
||||
steps_at_level = skp->tmp_steps;
|
||||
memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
|
||||
|
||||
node = skp->head;
|
||||
|
||||
for (level = skp->maxlevels - 1; level >= 0; --level) {
|
||||
next_at_level = node->next[level];
|
||||
while (_node_cmp(next_at_level, value) >= 0) {
|
||||
steps_at_level[level] += node->width[level];
|
||||
rank += node->width[level];
|
||||
node = next_at_level;
|
||||
next_at_level = node->next[level];
|
||||
}
|
||||
chain[level] = node;
|
||||
}
|
||||
|
||||
size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
|
||||
|
||||
newnode = node_init(value, size);
|
||||
if (!newnode) {
|
||||
return -1;
|
||||
}
|
||||
steps = 0;
|
||||
|
||||
for (level = 0; level < size; ++level) {
|
||||
prevnode = chain[level];
|
||||
newnode->next[level] = prevnode->next[level];
|
||||
|
||||
prevnode->next[level] = newnode;
|
||||
node_incref(newnode); // increment the reference count
|
||||
|
||||
newnode->width[level] = prevnode->width[level] - steps;
|
||||
prevnode->width[level] = steps + 1;
|
||||
|
||||
steps += steps_at_level[level];
|
||||
}
|
||||
|
||||
for (level = size; level < skp->maxlevels; ++level) {
|
||||
chain[level]->width[level] += 1;
|
||||
}
|
||||
|
||||
++(skp->size);
|
||||
|
||||
return rank + 1;
|
||||
}
|
||||
|
||||
// Remove one occurrence of `value`.
// Returns 1 if a node was removed, 0 if `value` is not in the list.
PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
    int level, size;
    node_t *node, *prevnode, *tmpnode, *next_at_level, *target;
    node_t **chain;

    chain = skp->tmp_chain;
    node = skp->head;

    // Find, per level, the last node strictly smaller than `value`.
    for (level = skp->maxlevels - 1; level >= 0; --level) {
        next_at_level = node->next[level];
        while (_node_cmp(next_at_level, value) > 0) {
            node = next_at_level;
            next_at_level = node->next[level];
        }
        chain[level] = node;
    }

    target = chain[0]->next[0];

    // The NIL sentinel stores 0.0, so its value must never be compared:
    // removing an absent 0.0 would otherwise "match" NIL and corrupt the
    // widths and the size.
    if (target->is_nil || value != target->value) {
        return 0;
    }

    size = target->levels;

    // Unlink the node at every level it participates in.
    for (level = 0; level < size; ++level) {
        prevnode = chain[level];

        tmpnode = prevnode->next[level];

        prevnode->width[level] += tmpnode->width[level] - 1;
        prevnode->next[level] = tmpnode->next[level];

        tmpnode->next[level] = NULL;
        node_destroy(tmpnode);  // decrement refcount or free
    }

    // Links passing over the removed node span one element fewer.
    for (level = size; level < skp->maxlevels; ++level) {
        --(chain[level]->width[level]);
    }

    --(skp->size);
    return 1;
}
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_SKIPLIST_H_
|
||||
@@ -0,0 +1,316 @@
|
||||
/*
|
||||
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the ESN Social Software AB nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
|
||||
https://github.com/client9/stringencoders
|
||||
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
|
||||
|
||||
Numeric decoder derived from TCL library
|
||||
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
|
||||
* Copyright (c) 1988-1993 The Regents of the University of California.
|
||||
* Copyright (c) 1994 Sun Microsystems, Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
Ultra fast JSON encoder and decoder
|
||||
Developed by Jonas Tarnstrom (jonas@esn.me).
|
||||
|
||||
Encoder notes:
|
||||
------------------
|
||||
|
||||
:: Cyclic references ::
|
||||
Cyclic referenced objects are not detected.
|
||||
Set JSONObjectEncoder.recursionMax to suitable value or make sure input object
|
||||
tree doesn't have cyclic references.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
|
||||
#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <wchar.h>
|
||||
|
||||
// Don't output any extra whitespaces when encoding
|
||||
#define JSON_NO_EXTRA_WHITESPACE
|
||||
|
||||
// Max decimals to encode double floating point numbers with
|
||||
#ifndef JSON_DOUBLE_MAX_DECIMALS
|
||||
#define JSON_DOUBLE_MAX_DECIMALS 15
|
||||
#endif
|
||||
|
||||
// Max recursion depth, default for encoder
|
||||
#ifndef JSON_MAX_RECURSION_DEPTH
|
||||
#define JSON_MAX_RECURSION_DEPTH 1024
|
||||
#endif
|
||||
|
||||
// Max recursion depth, default for decoder
|
||||
#ifndef JSON_MAX_OBJECT_DEPTH
|
||||
#define JSON_MAX_OBJECT_DEPTH 1024
|
||||
#endif
|
||||
|
||||
/*
|
||||
Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */
|
||||
#ifndef JSON_MAX_STACK_BUFFER_SIZE
|
||||
#define JSON_MAX_STACK_BUFFER_SIZE 131072
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
typedef __int64 JSINT64;
|
||||
typedef unsigned __int64 JSUINT64;
|
||||
|
||||
typedef __int32 JSINT32;
|
||||
typedef unsigned __int32 JSUINT32;
|
||||
typedef unsigned __int8 JSUINT8;
|
||||
typedef unsigned __int16 JSUTF16;
|
||||
typedef unsigned __int32 JSUTF32;
|
||||
typedef __int64 JSLONG;
|
||||
|
||||
#define EXPORTFUNCTION __declspec(dllexport)
|
||||
|
||||
#define FASTCALL_MSVC __fastcall
|
||||
|
||||
#define INLINE_PREFIX static __inline
|
||||
|
||||
#else
|
||||
|
||||
#include <stdint.h>
|
||||
typedef int64_t JSINT64;
|
||||
typedef uint64_t JSUINT64;
|
||||
|
||||
typedef int32_t JSINT32;
|
||||
typedef uint32_t JSUINT32;
|
||||
|
||||
#define FASTCALL_MSVC
|
||||
|
||||
#define INLINE_PREFIX static inline
|
||||
|
||||
typedef uint8_t JSUINT8;
|
||||
typedef uint16_t JSUTF16;
|
||||
typedef uint32_t JSUTF32;
|
||||
|
||||
typedef int64_t JSLONG;
|
||||
|
||||
#define EXPORTFUNCTION
|
||||
#endif
|
||||
|
||||
#if !(defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__))
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
#define __LITTLE_ENDIAN__
|
||||
#else
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
#define __BIG_ENDIAN__
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
|
||||
#error "Endianness not supported"
|
||||
#endif
|
||||
|
||||
/* Type tags for JSON values; numbered consecutively from 0. */
enum JSTYPES {
    JT_NULL = 0,  /* NULL */
    JT_TRUE,      /* boolean true */
    JT_FALSE,     /* boolean false */
    JT_INT,       /* JSINT32 (signed 32-bit) */
    JT_LONG,      /* JSINT64 (signed 64-bit) */
    JT_DOUBLE,    /* double */
    JT_BIGNUM,    /* integer larger than sys.maxsize */
    JT_UTF8,      /* char (8-bit) string */
    JT_ARRAY,     /* Array structure */
    JT_OBJECT,    /* Key/Value structure */
    JT_INVALID,   /* Internal, do not return nor expect */
    JT_POS_INF,   /* Positive infinity */
    JT_NEG_INF,   /* Negative infinity */
};
|
||||
|
||||
/* Opaque handles for an object being encoded/decoded and for an iterator. */
typedef void *JSOBJ;
typedef void *JSITER;

/* Per-value context handed to every encoder callback. */
typedef struct __JSONTypeContext {
    int type;       /* type tag (an int; see enum JSTYPES) */
    void *encoder;  /* NOTE(review): presumably the owning JSONObjectEncoder */
    void *prv;      /* private state owned by the callback implementor */
} JSONTypeContext;
|
||||
|
||||
/*
|
||||
Function pointer declarations, suitable for implementing UltraJSON */
|
||||
typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
|
||||
typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
|
||||
typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
|
||||
typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
|
||||
typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc,
|
||||
size_t *outLen);
|
||||
typedef void *(*JSPFN_MALLOC)(size_t size);
|
||||
typedef void (*JSPFN_FREE)(void *pptr);
|
||||
typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
|
||||
|
||||
typedef struct __JSONObjectEncoder {
|
||||
void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
|
||||
void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
|
||||
const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc,
|
||||
size_t *_outLen);
|
||||
JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
|
||||
JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
|
||||
double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
|
||||
const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
|
||||
size_t *_outLen);
|
||||
|
||||
/*
|
||||
Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT)
|
||||
Implementor should setup iteration state in ti->prv
|
||||
*/
|
||||
JSPFN_ITERBEGIN iterBegin;
|
||||
|
||||
/*
|
||||
Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items.
|
||||
Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this
|
||||
*/
|
||||
JSPFN_ITERNEXT iterNext;
|
||||
|
||||
/*
|
||||
Ends the iteration of an iteratable object.
|
||||
Any iteration state stored in ti->prv can be freed here
|
||||
*/
|
||||
JSPFN_ITEREND iterEnd;
|
||||
|
||||
/*
|
||||
Returns a reference to the value object of an iterator
|
||||
The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
|
||||
*/
|
||||
JSPFN_ITERGETVALUE iterGetValue;
|
||||
|
||||
/*
|
||||
Return name of iterator.
|
||||
The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
|
||||
*/
|
||||
JSPFN_ITERGETNAME iterGetName;
|
||||
|
||||
/*
|
||||
Release a value as indicated by setting ti->release = 1 in the previous getValue call.
|
||||
The ti->prv array should contain the necessary context to release the value
|
||||
*/
|
||||
void (*releaseObject)(JSOBJ obj);
|
||||
|
||||
/* Library functions
|
||||
Set to NULL to use STDLIB malloc,realloc,free */
|
||||
JSPFN_MALLOC malloc;
|
||||
JSPFN_REALLOC realloc;
|
||||
JSPFN_FREE free;
|
||||
|
||||
/*
|
||||
Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/
|
||||
int recursionMax;
|
||||
|
||||
/*
|
||||
Configuration for max decimals of double floating point numbers to encode (0-9) */
|
||||
int doublePrecision;
|
||||
|
||||
/*
|
||||
If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */
|
||||
int forceASCII;
|
||||
|
||||
/*
|
||||
If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
|
||||
int encodeHTMLChars;
|
||||
|
||||
/*
|
||||
Configuration for spaces of indent */
|
||||
int indent;
|
||||
|
||||
/*
|
||||
Set to an error message if error occurred */
|
||||
const char *errorMsg;
|
||||
JSOBJ errorObj;
|
||||
|
||||
/* Buffer stuff */
|
||||
char *start;
|
||||
char *offset;
|
||||
char *end;
|
||||
int heap;
|
||||
int level;
|
||||
} JSONObjectEncoder;
|
||||
|
||||
/*
|
||||
Encode an object structure into JSON.
|
||||
|
||||
Arguments:
|
||||
obj - An anonymous type representing the object
|
||||
enc - Function definitions for querying JSOBJ type
|
||||
buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
|
||||
cbBuffer - Length of buffer (ignored if buffer is NULL)
|
||||
|
||||
Returns:
|
||||
Encoded JSON object as a null terminated char string.
|
||||
|
||||
NOTE:
|
||||
If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
|
||||
Life cycle of the provided buffer must still be handled by caller.
|
||||
|
||||
If the return value doesn't equal the specified buffer caller must release the memory using
|
||||
JSONObjectEncoder.free or free() as specified when calling this function.
|
||||
*/
|
||||
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc,
|
||||
char *buffer, size_t cbBuffer);
|
||||
|
||||
typedef struct __JSONObjectDecoder {
|
||||
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
|
||||
int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
|
||||
int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
|
||||
JSOBJ (*newTrue)(void *prv);
|
||||
JSOBJ (*newFalse)(void *prv);
|
||||
JSOBJ (*newNull)(void *prv);
|
||||
JSOBJ (*newPosInf)(void *prv);
|
||||
JSOBJ (*newNegInf)(void *prv);
|
||||
JSOBJ (*newObject)(void *prv, void *decoder);
|
||||
JSOBJ (*endObject)(void *prv, JSOBJ obj);
|
||||
JSOBJ (*newArray)(void *prv, void *decoder);
|
||||
JSOBJ (*endArray)(void *prv, JSOBJ obj);
|
||||
JSOBJ (*newInt)(void *prv, JSINT32 value);
|
||||
JSOBJ (*newLong)(void *prv, JSINT64 value);
|
||||
JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value);
|
||||
JSOBJ (*newDouble)(void *prv, double value);
|
||||
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
|
||||
JSPFN_MALLOC malloc;
|
||||
JSPFN_FREE free;
|
||||
JSPFN_REALLOC realloc;
|
||||
char *errorStr;
|
||||
char *errorOffset;
|
||||
int preciseFloat;
|
||||
void *prv;
|
||||
} JSONObjectDecoder;
|
||||
|
||||
EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec,
|
||||
const char *buffer, size_t cbBuffer);
|
||||
EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t);
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,601 @@
|
||||
/*
|
||||
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the ESN Social Software AB nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
|
||||
https://github.com/client9/stringencoders
|
||||
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
|
||||
|
||||
Numeric decoder derived from from TCL library
|
||||
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
|
||||
* Copyright (c) 1988-1993 The Regents of the University of California.
|
||||
* Copyright (c) 1994 Sun Microsystems, Inc.
|
||||
*/
|
||||
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
|
||||
#define NO_IMPORT_ARRAY
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <numpy/arrayobject.h>
|
||||
#include <ultrajson.h>
|
||||
|
||||
#define PRINTMARK()
|
||||
|
||||
typedef struct __PyObjectDecoder {
|
||||
JSONObjectDecoder dec;
|
||||
|
||||
void *npyarr; // Numpy context buffer
|
||||
void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls
|
||||
npy_intp curdim; // Current array dimension
|
||||
|
||||
PyArray_Descr *dtype;
|
||||
} PyObjectDecoder;
|
||||
|
||||
typedef struct __NpyArrContext {
|
||||
PyObject *ret;
|
||||
PyObject *labels[2];
|
||||
PyArray_Dims shape;
|
||||
|
||||
PyObjectDecoder *dec;
|
||||
|
||||
npy_intp i;
|
||||
npy_intp elsize;
|
||||
npy_intp elcount;
|
||||
} NpyArrContext;
|
||||
|
||||
// Numpy handling based on numpy internal code, specifically the function
|
||||
// PyArray_FromIter.
|
||||
|
||||
// numpy related functions are inter-dependent so declare them all here,
|
||||
// to ensure the compiler catches any errors
|
||||
|
||||
// standard numpy array handling
|
||||
JSOBJ Object_npyNewArray(void *prv, void *decoder);
|
||||
JSOBJ Object_npyEndArray(void *prv, JSOBJ obj);
|
||||
int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value);
|
||||
|
||||
// for more complex dtypes (object and string) fill a standard Python list
|
||||
// and convert to a numpy array when done.
|
||||
JSOBJ Object_npyNewArrayList(void *prv, void *decoder);
|
||||
JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj);
|
||||
int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value);
|
||||
|
||||
// labelled support, encode keys and values of JS object into separate numpy
|
||||
// arrays
|
||||
JSOBJ Object_npyNewObject(void *prv, void *decoder);
|
||||
JSOBJ Object_npyEndObject(void *prv, JSOBJ obj);
|
||||
int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
|
||||
|
||||
// free the numpy context buffer
|
||||
void Npy_releaseContext(NpyArrContext *npyarr) {
|
||||
PRINTMARK();
|
||||
if (npyarr) {
|
||||
if (npyarr->shape.ptr) {
|
||||
PyObject_Free(npyarr->shape.ptr);
|
||||
}
|
||||
if (npyarr->dec) {
|
||||
npyarr->dec->npyarr = NULL;
|
||||
npyarr->dec->curdim = 0;
|
||||
}
|
||||
Py_XDECREF(npyarr->labels[0]);
|
||||
Py_XDECREF(npyarr->labels[1]);
|
||||
Py_XDECREF(npyarr->ret);
|
||||
PyObject_Free(npyarr);
|
||||
}
|
||||
}
|
||||
|
||||
JSOBJ Object_npyNewArray(void *prv, void *_decoder) {
|
||||
NpyArrContext *npyarr;
|
||||
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
|
||||
PRINTMARK();
|
||||
if (decoder->curdim <= 0) {
|
||||
// start of array - initialise the context buffer
|
||||
npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext));
|
||||
decoder->npyarr_addr = npyarr;
|
||||
|
||||
if (!npyarr) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
npyarr->dec = decoder;
|
||||
npyarr->labels[0] = npyarr->labels[1] = NULL;
|
||||
|
||||
npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS);
|
||||
npyarr->shape.len = 1;
|
||||
npyarr->ret = NULL;
|
||||
|
||||
npyarr->elsize = 0;
|
||||
npyarr->elcount = 4;
|
||||
npyarr->i = 0;
|
||||
} else {
|
||||
// starting a new dimension continue the current array (and reshape
|
||||
// after)
|
||||
npyarr = (NpyArrContext *)decoder->npyarr;
|
||||
if (decoder->curdim >= npyarr->shape.len) {
|
||||
npyarr->shape.len++;
|
||||
}
|
||||
}
|
||||
|
||||
npyarr->shape.ptr[decoder->curdim] = 0;
|
||||
decoder->curdim++;
|
||||
return npyarr;
|
||||
}
|
||||
|
||||
// Build the final return value for a decoded array.
//
// Without labels this is simply npyarr->ret. With labels it is a tuple
// (values, labels_dim0, labels_dim1, ...) where missing labels are None.
// Ownership of npyarr->ret and of the label objects moves into the returned
// object. Returns NULL if npyarr->ret is NULL or the tuple cannot be created;
// in the latter case npyarr->ret is released and cleared.
PyObject *Npy_returnLabelled(NpyArrContext *npyarr) {
    PyObject *ret = npyarr->ret;
    const npy_intp nlabels =
        (npy_intp)(sizeof(npyarr->labels) / sizeof(npyarr->labels[0]));
    npy_intp i;

    if (ret == NULL) {
        // an earlier conversion failed; never store NULL into a tuple
        return NULL;
    }

    if (npyarr->labels[0] || npyarr->labels[1]) {
        // finished decoding, build tuple with values and labels
        ret = PyTuple_New(npyarr->shape.len + 1);
        if (ret == NULL) {
            Py_CLEAR(npyarr->ret);
            return NULL;
        }
        for (i = 0; i < npyarr->shape.len; i++) {
            // only the first `nlabels` dimensions can carry labels
            if (i < nlabels && npyarr->labels[i]) {
                PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]);
                npyarr->labels[i] = NULL;
            } else {
                Py_INCREF(Py_None);
                PyTuple_SET_ITEM(ret, i + 1, Py_None);
            }
        }
        PyTuple_SET_ITEM(ret, 0, npyarr->ret);
    }

    return ret;
}
|
||||
|
||||
// endArray callback for numpy decoding.
//
// Closes one array dimension. When the outermost dimension is closed the data
// buffer is shrunk to its final size, the array is reshaped if more than one
// dimension was seen, labels are attached and the context is released.
// Returns the array (or labelled tuple), or NULL with an exception set.
JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) {
    PyObject *ret;
    char *new_data;
    NpyArrContext *npyarr = (NpyArrContext *)obj;
    int emptyType = NPY_DEFAULT_TYPE;
    npy_intp i;
    PRINTMARK();
    if (!npyarr) {
        return NULL;
    }

    ret = npyarr->ret;
    i = npyarr->i;

    npyarr->dec->curdim--;

    if (i == 0 || !npyarr->ret) {
        // empty array would not have been initialised so do it now.
        if (npyarr->dec->dtype) {
            emptyType = npyarr->dec->dtype->type_num;
        }
        npyarr->ret = ret =
            PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0);
        if (ret == NULL) {
            // allocation failed; exception already set by numpy
            Npy_releaseContext(npyarr);
            return NULL;
        }
    } else if (npyarr->dec->curdim <= 0) {
        // realloc to final size
        new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize);
        if (new_data == NULL) {
            PyErr_NoMemory();
            Npy_releaseContext(npyarr);
            return NULL;
        }
        ((PyArrayObject *)ret)->data = (void *)new_data;
        // PyArray_BYTES(ret) = new_data;
    }

    if (npyarr->dec->curdim <= 0) {
        // finished decoding array, reshape if necessary
        if (npyarr->shape.len > 1) {
            npyarr->ret = PyArray_Newshape((PyArrayObject *)ret,
                                           &npyarr->shape, NPY_ANYORDER);
            Py_DECREF(ret);
            if (npyarr->ret == NULL) {
                // reshape failed; exception already set by numpy
                Npy_releaseContext(npyarr);
                return NULL;
            }
        }

        ret = Npy_returnLabelled(npyarr);

        // ownership moved into `ret`; keep releaseContext from dropping it
        npyarr->ret = NULL;
        Npy_releaseContext(npyarr);
    }

    return ret;
}
|
||||
|
||||
// arrayAddItem callback for numpy decoding.
//
// Stores `value` as the next element of the array under construction,
// creating the array on the first element (sniffing the dtype from the value
// when the caller did not supply one) and growing the data buffer as needed.
// Object/variable-length dtypes switch the decoder over to the list-based
// callbacks. Returns 1 on success; on failure releases the context and
// returns 0 (normally with a Python exception set).
int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
    PyObject *type;
    PyArray_Descr *dtype;
    npy_intp i;
    char *new_data, *item;
    NpyArrContext *npyarr = (NpyArrContext *)obj;
    PRINTMARK();
    if (!npyarr) {
        return 0;
    }

    i = npyarr->i;

    npyarr->shape.ptr[npyarr->dec->curdim - 1]++;

    if (PyArray_Check((PyObject *)value)) {
        // multidimensional array, keep decoding values.
        return 1;
    }

    if (!npyarr->ret) {
        // Array not initialised yet.
        // We do it here so we can 'sniff' the data type if none was provided
        if (!npyarr->dec->dtype) {
            type = PyObject_Type(value);
            if (!type) {
                goto fail;
            }
            if (!PyArray_DescrConverter(type, &dtype)) {
                Py_DECREF(type);
                goto fail;
            }
            // PyArray_DescrConverter already hands back a new reference, so
            // no additional INCREF is needed (it would leak).
            Py_DECREF(type);
        } else {
            dtype = PyArray_DescrNew(npyarr->dec->dtype);
            if (!dtype) {
                goto fail;
            }
        }

        // If it's an object or string then fill a Python list and subsequently
        // convert. Otherwise we would need to somehow mess about with
        // reference counts when renewing memory.
        npyarr->elsize = dtype->elsize;
        if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) {
            Py_XDECREF(dtype);

            if (npyarr->dec->curdim > 1) {
                PyErr_SetString(PyExc_ValueError,
                                "Cannot decode multidimensional arrays with "
                                "variable length elements to numpy");
                goto fail;
            }
            npyarr->elcount = 0;
            npyarr->ret = PyList_New(0);
            if (!npyarr->ret) {
                goto fail;
            }
            ((JSONObjectDecoder *)npyarr->dec)->newArray =
                Object_npyNewArrayList;
            ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem =
                Object_npyArrayListAddItem;
            ((JSONObjectDecoder *)npyarr->dec)->endArray =
                Object_npyEndArrayList;
            return Object_npyArrayListAddItem(prv, obj, value);
        }

        // PyArray_NewFromDescr steals the reference to dtype
        npyarr->ret = PyArray_NewFromDescr(
            &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL);

        if (!npyarr->ret) {
            goto fail;
        }
    }

    if (i >= npyarr->elcount) {
        // Grow PyArray_DATA(ret):
        // this is similar for the strategy for PyListObject, but we use
        // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
        if (npyarr->elsize == 0) {
            PyErr_SetString(PyExc_ValueError,
                            "Cannot decode multidimensional arrays with "
                            "variable length elements to numpy");
            goto fail;
        }

        npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
        if (npyarr->elcount > NPY_MAX_INTP / npyarr->elsize) {
            // requested byte count would overflow npy_intp
            PyErr_NoMemory();
            goto fail;
        }
        new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret),
                                   npyarr->elcount * npyarr->elsize);
        if (new_data == NULL) {
            // the old buffer is still owned by the array and is freed when
            // the context (and with it the array) is released below
            PyErr_NoMemory();
            goto fail;
        }
        ((PyArrayObject *)npyarr->ret)->data = (void *)new_data;

        // PyArray_BYTES(npyarr->ret) = new_data;
    }

    PyArray_DIMS(npyarr->ret)[0] = i + 1;

    if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL ||
        PyArray_SETITEM(npyarr->ret, item, value) == -1) {
        goto fail;
    }

    Py_DECREF((PyObject *)value);
    npyarr->i++;
    return 1;

fail:

    Npy_releaseContext(npyarr);
    return 0;
}
|
||||
|
||||
JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) {
|
||||
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
|
||||
PRINTMARK();
|
||||
PyErr_SetString(
|
||||
PyExc_ValueError,
|
||||
"nesting not supported for object or variable length dtypes");
|
||||
Npy_releaseContext(decoder->npyarr);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// endArray callback for list-backed arrays: converts the collected Python
// list into a numpy array, attaches labels, restores the regular numpy
// callbacks on the decoder and releases the context.
JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) {
    NpyArrContext *ctx = (NpyArrContext *)obj;
    JSONObjectDecoder *base;
    PyObject *pylist, *result;
    PRINTMARK();

    if (ctx == NULL) {
        return NULL;
    }

    // convert decoded list to numpy array
    pylist = (PyObject *)ctx->ret;
    ctx->ret = PyArray_FROM_O(pylist);

    result = Npy_returnLabelled(ctx);

    // hand the list back to the context so that releasing it drops the list
    ctx->ret = pylist;

    base = (JSONObjectDecoder *)ctx->dec;
    base->newArray = Object_npyNewArray;
    base->arrayAddItem = Object_npyArrayAddItem;
    base->endArray = Object_npyEndArray;

    Npy_releaseContext(ctx);
    return result;
}
|
||||
|
||||
// arrayAddItem callback for list-backed arrays: appends `value` to the
// temporary Python list and drops the decoder's reference to it.
// Returns 1 on success, 0 if there is no context or the append failed
// (mirroring Object_arrayAddItem).
int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) {
    int status;
    NpyArrContext *npyarr = (NpyArrContext *)obj;
    PRINTMARK();
    if (!npyarr) {
        return 0;
    }

    status = PyList_Append((PyObject *)npyarr->ret, value);
    Py_DECREF((PyObject *)value);
    if (status != 0) {
        // exception already set by PyList_Append; do not count the item
        return 0;
    }

    npyarr->elcount++;
    return 1;
}
|
||||
|
||||
JSOBJ Object_npyNewObject(void *prv, void *_decoder) {
|
||||
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
|
||||
PRINTMARK();
|
||||
if (decoder->curdim > 1) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"labels only supported up to 2 dimensions");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ((JSONObjectDecoder *)decoder)->newArray(prv, decoder);
|
||||
}
|
||||
|
||||
// endObject callback for labelled numpy decoding: converts the label list of
// the current dimension into a numpy array, then finishes the values through
// the decoder's current endArray callback.
JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
    NpyArrContext *ctx = (NpyArrContext *)obj;
    npy_intp dim;
    PyObject *labelList;
    PRINTMARK();

    if (ctx == NULL) {
        return NULL;
    }

    dim = ctx->dec->curdim - 1;
    labelList = ctx->labels[dim];

    if (labelList != NULL) {
        // replace the list by its array form and drop the list
        ctx->labels[dim] = PyArray_FROM_O(labelList);
        Py_DECREF(labelList);
    }

    return (PyObject *)((JSONObjectDecoder *)ctx->dec)->endArray(prv, obj);
}
|
||||
|
||||
// objectAddKey callback for labelled numpy decoding: records `name` in the
// label list of the current dimension and stores `value` through the
// decoder's current arrayAddItem callback.
// Returns 1 on success (the reference to `name` is released), 0 on failure
// (the reference to `name` is left untouched, as before).
int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
    PyObject *label, *labels;
    npy_intp labelidx;
    // add key to label array, value to values array
    NpyArrContext *npyarr = (NpyArrContext *)obj;
    PRINTMARK();
    if (!npyarr) {
        return 0;
    }

    label = (PyObject *)name;
    labelidx = npyarr->dec->curdim - 1;
    if (labelidx < 0 || labelidx > 1) {
        // labels[] only has two slots; never index outside of it
        PyErr_SetString(PyExc_ValueError,
                        "labels only supported up to 2 dimensions");
        return 0;
    }

    if (!npyarr->labels[labelidx]) {
        npyarr->labels[labelidx] = PyList_New(0);
        if (!npyarr->labels[labelidx]) {
            // allocation failed; exception already set
            return 0;
        }
    }
    labels = npyarr->labels[labelidx];
    // only fill label array once, assumes all column labels are the same
    // for 2-dimensional arrays.
    if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
        if (PyList_Append(labels, label) != 0) {
            return 0;
        }
    }

    // NOTE: on failure the callback may already have released the context,
    // so npyarr must not be touched afterwards.
    if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
        Py_DECREF(label);
        return 1;
    }
    return 0;
}
|
||||
|
||||
// objectAddKey callback for plain decoding: stores name -> value in the dict
// `obj` and releases the decoder's references to both. Returns 1 on success,
// 0 if PyDict_SetItem failed.
int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
    const int status = PyDict_SetItem(obj, name, value);

    Py_DECREF((PyObject *)name);
    Py_DECREF((PyObject *)value);

    return status == 0;
}
|
||||
|
||||
// arrayAddItem callback for plain decoding: appends `value` to the list `obj`
// and releases the decoder's reference to it. Returns 1 on success, 0 if
// PyList_Append failed.
int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
    const int status = PyList_Append(obj, value);

    Py_DECREF((PyObject *)value);

    return status == 0;
}
|
||||
|
||||
// newString callback: builds a Python str from the wide characters in
// [start, end).
JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
    return PyUnicode_FromWideChar(start, end - start);
}
|
||||
|
||||
// newTrue callback: JSON `true` -> Python True.
JSOBJ Object_newTrue(void *prv) {
    Py_RETURN_TRUE;
}
|
||||
|
||||
// newFalse callback: JSON `false` -> Python False.
JSOBJ Object_newFalse(void *prv) {
    Py_RETURN_FALSE;
}
|
||||
|
||||
// newNull callback: JSON `null` -> Python None.
JSOBJ Object_newNull(void *prv) {
    Py_RETURN_NONE;
}
|
||||
|
||||
// newPosInf callback: returns a Python float holding +Py_HUGE_VAL.
JSOBJ Object_newPosInf(void *prv) {
    return PyFloat_FromDouble(Py_HUGE_VAL);
}
|
||||
|
||||
// newNegInf callback: returns a Python float holding -Py_HUGE_VAL.
JSOBJ Object_newNegInf(void *prv) {
    return PyFloat_FromDouble(-Py_HUGE_VAL);
}
|
||||
|
||||
// newObject callback for plain decoding: a JSON object becomes a new dict.
JSOBJ Object_newObject(void *prv, void *decoder) {
    return PyDict_New();
}
|
||||
|
||||
// endObject callback for plain decoding: the dict is returned unchanged.
JSOBJ Object_endObject(void *prv, JSOBJ obj) {
    return obj;
}
|
||||
|
||||
// newArray callback for plain decoding: a JSON array becomes a new empty list.
JSOBJ Object_newArray(void *prv, void *decoder) {
    return PyList_New(0);
}
|
||||
|
||||
// endArray callback for plain decoding: the list is returned unchanged.
JSOBJ Object_endArray(void *prv, JSOBJ obj) {
    return obj;
}
|
||||
|
||||
// newInt callback: 32-bit JSON integer -> Python int.
JSOBJ Object_newInteger(void *prv, JSINT32 value) {
    const long widened = (long)value;
    return PyLong_FromLong(widened);
}
|
||||
|
||||
// newLong callback: signed 64-bit JSON integer -> Python int.
JSOBJ Object_newLong(void *prv, JSINT64 value) {
    PyObject *number = PyLong_FromLongLong(value);
    return number;
}
|
||||
|
||||
// newUnsignedLong callback: unsigned 64-bit JSON integer -> Python int.
JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
    PyObject *number = PyLong_FromUnsignedLongLong(value);
    return number;
}
|
||||
|
||||
JSOBJ Object_newDouble(void *prv, double value) {
|
||||
return PyFloat_FromDouble(value);
|
||||
}
|
||||
|
||||
// releaseObject callback: drops one reference to `obj` unless `obj` is the
// address recorded for the numpy context, which is not a Python object.
static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
    PyObjectDecoder *pydec = (PyObjectDecoder *)_decoder;

    if (obj == pydec->npyarr_addr) {
        return;
    }
    Py_XDECREF(((PyObject *)obj));
}
|
||||
|
||||
static char *g_kwlist[] = {"obj", "precise_float", "numpy",
|
||||
"labelled", "dtype", NULL};
|
||||
|
||||
// Python entry point: decode a JSON document.
//
// Arguments (see g_kwlist): obj (str or bytes), precise_float, numpy,
// labelled, dtype. With numpy=True arrays are decoded straight into numpy
// arrays (optionally with labels); otherwise into lists and dicts.
// Returns a new reference to the decoded object, or NULL with an exception
// set (TypeError for an unsupported input type, ValueError for a decoder
// error message, or whatever exception a callback raised).
PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
    PyObject *ret;
    PyObject *sarg;
    PyObject *arg;
    PyObject *opreciseFloat = NULL;
    JSONObjectDecoder *decoder;
    PyObjectDecoder pyDecoder;
    PyArray_Descr *dtype = NULL;
    int numpy = 0, labelled = 0;
    int isPrecise;

    // positional initializer: order must match JSONObjectDecoder
    JSONObjectDecoder dec = {
        Object_newString, Object_objectAddKey,  Object_arrayAddItem,
        Object_newTrue,   Object_newFalse,      Object_newNull,
        Object_newPosInf, Object_newNegInf,     Object_newObject,
        Object_endObject, Object_newArray,      Object_endArray,
        Object_newInteger, Object_newLong,      Object_newUnsignedLong,
        Object_newDouble,
        Object_releaseObject, PyObject_Malloc,  PyObject_Free,
        PyObject_Realloc};

    dec.preciseFloat = 0;
    dec.prv = NULL;

    pyDecoder.dec = dec;
    pyDecoder.curdim = 0;
    pyDecoder.npyarr = NULL;
    pyDecoder.npyarr_addr = NULL;
    pyDecoder.dtype = NULL;  // never leave the field indeterminate

    decoder = (JSONObjectDecoder *)&pyDecoder;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg,
                                     &opreciseFloat, &numpy, &labelled,
                                     PyArray_DescrConverter2, &dtype)) {
        Npy_releaseContext(pyDecoder.npyarr);
        return NULL;
    }

    // From here on `dtype` (if not NULL) is a reference owned by this
    // function and must be released on every exit path.

    if (opreciseFloat) {
        isPrecise = PyObject_IsTrue(opreciseFloat);
        if (isPrecise < 0) {
            // truth-testing raised; propagate instead of treating as true
            Py_XDECREF(dtype);
            return NULL;
        }
        if (isPrecise) {
            decoder->preciseFloat = 1;
        }
    }

    if (PyBytes_Check(arg)) {
        sarg = arg;
    } else if (PyUnicode_Check(arg)) {
        sarg = PyUnicode_AsUTF8String(arg);
        if (sarg == NULL) {
            // Exception raised above us by codec according to docs
            Py_XDECREF(dtype);
            return NULL;
        }
    } else {
        PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'");
        Py_XDECREF(dtype);
        return NULL;
    }

    decoder->errorStr = NULL;
    decoder->errorOffset = NULL;

    if (numpy) {
        pyDecoder.dtype = dtype;
        decoder->newArray = Object_npyNewArray;
        decoder->endArray = Object_npyEndArray;
        decoder->arrayAddItem = Object_npyArrayAddItem;

        if (labelled) {
            decoder->newObject = Object_npyNewObject;
            decoder->endObject = Object_npyEndObject;
            decoder->objectAddKey = Object_npyObjectAddKey;
        }
    }

    ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg),
                            PyBytes_GET_SIZE(sarg));

    if (sarg != arg) {
        Py_DECREF(sarg);
    }

    // Decoding is over; the callbacks were the only readers of dtype.
    pyDecoder.dtype = NULL;
    Py_XDECREF(dtype);

    if (PyErr_Occurred()) {
        if (ret) {
            Py_DECREF((PyObject *)ret);
        }
        Npy_releaseContext(pyDecoder.npyarr);
        return NULL;
    }

    if (decoder->errorStr) {
        /*
        FIXME: It's possible to give a much nicer error message here with actual
        failing element in input etc*/

        PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr);

        if (ret) {
            Py_DECREF((PyObject *)ret);
        }
        Npy_releaseContext(pyDecoder.npyarr);

        return NULL;
    }

    return ret;
}
|
||||
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright (c) 2020, PyData Development Team
|
||||
All rights reserved.
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
// Conversion routines that are useful for serialization,
|
||||
// but which don't interact with JSON objects directly
|
||||
|
||||
#include "date_conversions.h"
|
||||
#include <../../../tslibs/src/datetime/np_datetime.h>
|
||||
#include <../../../tslibs/src/datetime/np_datetime_strings.h>
|
||||
|
||||
/*
|
||||
* Function: scaleNanosecToUnit
|
||||
* -----------------------------
|
||||
*
|
||||
* Scales an integer value representing time in nanoseconds to provided unit.
|
||||
*
|
||||
* Mutates the provided value directly. Returns 0 on success, non-zero on error.
|
||||
*/
|
||||
int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
    npy_int64 divisor;

    // pick the number of nanoseconds per target unit
    switch (unit) {
    case NPY_FR_ns:
        divisor = 1LL;
        break;
    case NPY_FR_us:
        divisor = 1000LL;
        break;
    case NPY_FR_ms:
        divisor = 1000000LL;
        break;
    case NPY_FR_s:
        divisor = 1000000000LL;
        break;
    default:
        // unsupported unit: leave *value untouched
        return -1;
    }

    *value /= divisor;
    return 0;
}
|
||||
|
||||
/* Converts the int64_t representation of a datetime to ISO; mutates len */
|
||||
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
|
||||
npy_datetimestruct dts;
|
||||
int ret_code;
|
||||
|
||||
pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
|
||||
|
||||
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
|
||||
char *result = PyObject_Malloc(*len);
|
||||
|
||||
if (result == NULL) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
|
||||
if (ret_code != 0) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"Could not convert datetime value to string");
|
||||
PyObject_Free(result);
|
||||
}
|
||||
|
||||
// Note that get_datetime_iso_8601_strlen just gives a generic size
|
||||
// for ISO string conversion, not the actual size used
|
||||
*len = strlen(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Scales a nanosecond-resolution datetime value to resolution `base`.
// The status of scaleNanosecToUnit is not checked: for an unsupported unit
// the value is returned unchanged.
npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
    npy_datetime scaled = dt;

    (void)scaleNanosecToUnit(&scaled, base);
    return scaled;
}
|
||||
|
||||
/* Convert PyDatetime To ISO C-string. mutates len */
|
||||
char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
|
||||
size_t *len) {
|
||||
npy_datetimestruct dts;
|
||||
int ret;
|
||||
|
||||
ret = convert_pydatetime_to_datetimestruct(obj, &dts);
|
||||
if (ret != 0) {
|
||||
if (!PyErr_Occurred()) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"Could not convert PyDateTime to numpy datetime");
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
|
||||
char *result = PyObject_Malloc(*len);
|
||||
ret = make_iso_8601_datetime(&dts, result, *len, base);
|
||||
|
||||
if (ret != 0) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"Could not convert datetime value to string");
|
||||
PyObject_Free(result);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Note that get_datetime_iso_8601_strlen just gives a generic size
|
||||
// for ISO string conversion, not the actual size used
|
||||
*len = strlen(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Converts a Python date/datetime object to an epoch value with resolution
// `base`.
//
// On conversion failure a Python exception is set and -1 is returned. Since
// -1 is also a valid epoch value, callers must use PyErr_Occurred() to tell
// the two apart.
npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
    npy_datetimestruct dts;
    int ret;

    ret = convert_pydatetime_to_datetimestruct(dt, &dts);
    if (ret != 0) {
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError,
                            "Could not convert PyDateTime to numpy datetime");
        }
        // TODO(username): is setting errMsg required?
        // `dts` was not filled in, so computing an epoch from it would read
        // indeterminate memory.
        return -1;
    }

    npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
    return NpyDateTimeToEpoch(npy_dt, base);
}
|
||||
|
||||
/* Converts the int64_t representation of a duration to ISO; mutates len */
|
||||
char *int64ToIsoDuration(int64_t value, size_t *len) {
|
||||
pandas_timedeltastruct tds;
|
||||
int ret_code;
|
||||
|
||||
pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
|
||||
|
||||
// Max theoretical length of ISO Duration with 64 bit day
|
||||
// as the largest unit is 70 characters + 1 for a null terminator
|
||||
char *result = PyObject_Malloc(71);
|
||||
if (result == NULL) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret_code = make_iso_8601_timedelta(&tds, result, len);
|
||||
if (ret_code == -1) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"Could not convert timedelta value to string");
|
||||
PyObject_Free(result);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
Copyright (c) 2020, PyData Development Team
|
||||
All rights reserved.
|
||||
Distributed under the terms of the BSD Simplified License.
|
||||
The full license is in the LICENSE file, distributed with this software.
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
|
||||
#define PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <numpy/ndarraytypes.h>
|
||||
|
||||
// Scales value inplace from nanosecond resolution to unit resolution
|
||||
int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
|
||||
|
||||
// Converts an int64 object representing a date to ISO format
|
||||
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
|
||||
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
|
||||
// len is mutated to save the length of the returned string
|
||||
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
|
||||
|
||||
// TODO(username): this function doesn't do a lot; should augment or
|
||||
// replace with scaleNanosecToUnit
|
||||
npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
|
||||
|
||||
// Converts a Python object representing a Date / Datetime to ISO format
|
||||
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
|
||||
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
|
||||
// len is mutated to save the length of the returned string
|
||||
char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len);
|
||||
|
||||
// Convert a Python Date/Datetime to Unix epoch with resolution base
|
||||
npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base);
|
||||
|
||||
char *int64ToIsoDuration(int64_t value, size_t *len);
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the ESN Social Software AB nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
|
||||
https://github.com/client9/stringencoders
|
||||
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
|
||||
|
||||
Numeric decoder derived from from TCL library
|
||||
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
|
||||
* Copyright (c) 1988-1993 The Regents of the University of California.
|
||||
* Copyright (c) 1994 Sun Microsystems, Inc.
|
||||
*/
|
||||
|
||||
#include "version.h"
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
|
||||
#include "numpy/arrayobject.h"
|
||||
|
||||
/* objToJSON */
|
||||
PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs);
|
||||
void initObjToJSON(void);
|
||||
|
||||
/* JSONToObj */
|
||||
PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs);
|
||||
|
||||
/* Shared tail of the "encode"/"dumps" docstrings describing the encoder's
 * keyword options. The adjacent string literals are concatenated by the
 * compiler into a single string.
 *
 * The Python literal is spelled "False" (capitalised), matching the
 * "encode_html_chars=True" spelling used later in the same text. */
#define ENCODER_HELP_TEXT                                                  \
    "Use ensure_ascii=False to output UTF-8. Pass in double_precision to " \
    "alter the maximum digit precision of doubles. Set "                   \
    "encode_html_chars=True to encode < > & as unicode escape sequences."
|
||||
|
||||
static PyMethodDef ujsonMethods[] = {
|
||||
{"encode", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
|
||||
"Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
|
||||
{"decode", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
|
||||
"Converts JSON as string to dict object structure. Use precise_float=True "
|
||||
"to use high precision float decoder."},
|
||||
{"dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
|
||||
"Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
|
||||
{"loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
|
||||
"Converts JSON as string to dict object structure. Use precise_float=True "
|
||||
"to use high precision float decoder."},
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
/* Module definition handed to PyModuleDef_Init by the init function.
 * Only the base, name and method table are set; members not named in a
 * designated initialiser are zero-initialised by the C language rules. */
static PyModuleDef moduledef = {
    .m_base = PyModuleDef_HEAD_INIT,
    /* NOTE(review): the name here is "_libjson" while the init function is
     * PyInit_json -- confirm the mismatch is intended. */
    .m_name = "_libjson",
    .m_methods = ujsonMethods
};
|
||||
|
||||
|
||||
/* Module init entry point: runs NumPy's import_array, calls initObjToJSON,
 * then returns the result of PyModuleDef_Init for the module definition. */
PyMODINIT_FUNC PyInit_json(void) {
    /* NumPy C-API import macro; written without a trailing ';' here. */
    import_array()
    initObjToJSON(); // TODO(username): clean up, maybe via tp_free?
    return PyModuleDef_Init(&moduledef);
}
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the ESN Social Software AB nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
|
||||
https://github.com/client9/stringencoders
|
||||
Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
|
||||
|
||||
Numeric decoder derived from TCL library
|
||||
https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
|
||||
* Copyright (c) 1988-1993 The Regents of the University of California.
|
||||
* Copyright (c) 1994 Sun Microsystems, Inc.
|
||||
*/
|
||||
|
||||
#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
|
||||
#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
|
||||
|
||||
#define UJSON_VERSION "1.33"
|
||||
|
||||
#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
|
||||
Binary file not shown.
@@ -0,0 +1,12 @@
|
||||
# Stub for the compiled dict comparison helper; the implementation returns
# True on success and fails with an assertion otherwise.
def assert_dict_equal(a, b, compare_keys: bool = ...) -> bool: ...
|
||||
# Stub for the compiled approximate-equality helper; the implementation
# returns True on success and raises AssertionError otherwise.
def assert_almost_equal(
    a,
    b,
    rtol: float = ...,
    atol: float = ...,
    check_dtype: bool = ...,
    obj=...,
    lobj=...,
    robj=...,
    index_values=...,
) -> bool: ...
|
||||
@@ -0,0 +1,208 @@
|
||||
import cmath
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numpy cimport import_array
|
||||
|
||||
import_array()
|
||||
|
||||
from pandas._libs.util cimport (
|
||||
is_array,
|
||||
is_complex_object,
|
||||
is_real_number_object,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_dtype_equal
|
||||
from pandas.core.dtypes.missing import (
|
||||
array_equivalent,
|
||||
isna,
|
||||
)
|
||||
|
||||
|
||||
# True when ``obj`` exposes an ``__iter__`` attribute.
cdef bint isiterable(obj):
    return hasattr(obj, '__iter__')
|
||||
|
||||
|
||||
# True when ``obj`` exposes a ``__len__`` attribute.
cdef bint has_length(obj):
    return hasattr(obj, '__len__')
|
||||
|
||||
|
||||
# Duck-typed mapping check: ``obj`` must expose both ``keys`` and
# ``__getitem__``.
cdef bint is_dictlike(obj):
    return hasattr(obj, 'keys') and hasattr(obj, '__getitem__')
|
||||
|
||||
|
||||
cpdef assert_dict_equal(a, b, bint compare_keys=True):
    """
    Assert that two dict-like objects hold almost-equal values.

    Parameters
    ----------
    a : dict-like
    b : dict-like
    compare_keys : bool, default True
        When True, additionally assert that both objects have the same
        set of keys.

    Returns
    -------
    bool
        Always True when no assertion fails.

    Notes
    -----
    Values are compared with ``assert_almost_equal`` using its default
    tolerances, looking up every key of ``a`` in ``b``.
    """
    assert is_dictlike(a) and is_dictlike(b), (
        "Cannot compare dict objects, one or both is not dict-like"
    )

    left_keys = frozenset(a.keys())
    right_keys = frozenset(b.keys())

    if compare_keys:
        assert left_keys == right_keys

    # Only the keys of ``a`` drive the value comparison.
    for key in left_keys:
        assert_almost_equal(a[key], b[key])

    return True
|
||||
|
||||
|
||||
cpdef assert_almost_equal(a, b,
                          rtol=1.e-5, atol=1.e-8,
                          bint check_dtype=True,
                          obj=None, lobj=None, robj=None, index_values=None):
    """
    Check that left and right objects are almost equal.

    Dicts are delegated to ``assert_dict_equal``, strings are compared
    exactly, iterables are compared element by element (recursively), and
    scalars are compared with ``math.isclose`` / ``cmath.isclose``.

    Parameters
    ----------
    a : object
    b : object
    rtol : float, default 1e-5
        Relative tolerance.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance.

        .. versionadded:: 1.1.0
    check_dtype : bool, default True
        check dtype if both a and b are np.ndarray.
    obj : str, default None
        Specify object name being compared, internally used to show
        appropriate assertion message.
    lobj : str, default None
        Specify left object name being compared, internally used to show
        appropriate assertion message.
    robj : str, default None
        Specify right object name being compared, internally used to show
        appropriate assertion message.
    index_values : ndarray, default None
        Specify shared index values of objects being compared, internally used
        to show appropriate assertion message.

        .. versionadded:: 1.1.0

    Returns
    -------
    bool
        True when the objects compare as almost equal.

    Raises
    ------
    AssertionError
        When the objects differ. Some failures are reported through helpers
        imported from ``pandas._testing`` (presumably also raising
        AssertionError -- confirm against those helpers).
    """
    cdef:
        # Number of mismatching elements found in the element-wise loop.
        double diff = 0.0
        Py_ssize_t i, na, nb
        # C doubles used for the real-number comparison below.
        double fa, fb
        bint is_unequal = False, a_is_ndarray, b_is_ndarray

    # When no explicit left/right objects are given, report a and b themselves.
    if lobj is None:
        lobj = a
    if robj is None:
        robj = b

    # Note: rtol/atol are not forwarded to the dict comparison.
    if isinstance(a, dict) or isinstance(b, dict):
        return assert_dict_equal(a, b)

    # Strings are compared exactly, never element by element.
    if isinstance(a, str) or isinstance(b, str):
        assert a == b, f"{a} != {b}"
        return True

    a_is_ndarray = is_array(a)
    b_is_ndarray = is_array(b)

    # Default label used in assertion messages.
    if obj is None:
        if a_is_ndarray or b_is_ndarray:
            obj = 'numpy array'
        else:
            obj = 'Iterable'

    if isiterable(a):

        if not isiterable(b):
            from pandas._testing import assert_class_equal

            # classes can't be the same, to raise error
            assert_class_equal(a, b, obj=obj)

        assert has_length(a) and has_length(b), (
            f"Can't compare objects without length, one or both is invalid: ({a}, {b})"
        )

        if a_is_ndarray and b_is_ndarray:
            # For ndarrays the sizes are total element counts, not len().
            na, nb = a.size, b.size
            if a.shape != b.shape:
                from pandas._testing import raise_assert_detail
                raise_assert_detail(
                    obj, f'{obj} shapes are different', a.shape, b.shape)

            if check_dtype and not is_dtype_equal(a.dtype, b.dtype):
                from pandas._testing import assert_attr_equal
                assert_attr_equal('dtype', a, b, obj=obj)

            # Fast path: skip the element-wise loop when the arrays already
            # compare as equivalent.
            if array_equivalent(a, b, strict_nan=True):
                return True

        else:
            na, nb = len(a), len(b)

        if na != nb:
            from pandas._testing import raise_assert_detail

            # if we have a small diff set, print it
            if abs(na - nb) < 10:
                r = list(set(a) ^ set(b))
            else:
                r = None

            raise_assert_detail(obj, f"{obj} length are different", na, nb, r)

        # Compare element by element; only rtol/atol are passed down, so the
        # recursive calls use the defaults for every other argument.
        for i in range(len(a)):
            try:
                assert_almost_equal(a[i], b[i], rtol=rtol, atol=atol)
            except AssertionError:
                is_unequal = True
                diff += 1

        if is_unequal:
            from pandas._testing import raise_assert_detail
            # Percentage of mismatches relative to na (a.size for ndarrays,
            # len(a) otherwise).
            msg = (f"{obj} values are different "
                   f"({np.round(diff * 100.0 / na, 5)} %)")
            raise_assert_detail(obj, msg, lobj, robj, index_values=index_values)

        return True

    elif isiterable(b):
        # a is not iterable but b is.
        from pandas._testing import assert_class_equal

        # classes can't be the same, to raise error
        assert_class_equal(a, b, obj=obj)

    if isna(a) and isna(b):
        # TODO: Should require same-dtype NA?
        # nan / None comparison
        return True

    if a == b:
        # object comparison
        return True

    if is_real_number_object(a) and is_real_number_object(b):
        if array_equivalent(a, b, strict_nan=True):
            # inf comparison
            return True

        # Assigning to the cdef doubles converts both values to C doubles.
        fa, fb = a, b

        if not math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol):
            assert False, (f"expected {fb:.5f} but got {fa:.5f}, "
                           f"with rtol={rtol}, atol={atol}")
        return True

    if is_complex_object(a) and is_complex_object(b):
        if array_equivalent(a, b, strict_nan=True):
            # inf comparison
            return True

        if not cmath.isclose(a, b, rel_tol=rtol, abs_tol=atol):
            assert False, (f"expected {b:.5f} but got {a:.5f}, "
                           f"with rtol={rtol}, atol={atol}")
        return True

    # No comparison strategy matched: treat the objects as different.
    raise AssertionError(f"{a} != {b}")
|
||||
Binary file not shown.
@@ -0,0 +1,28 @@
|
||||
from datetime import tzinfo
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import npt
|
||||
|
||||
# Stub: takes an int64 ndarray plus optional tz / format / na_rep and
# returns an object-dtype ndarray.
# NOTE(review): presumably the int64 values are epoch timestamps and the
# result holds formatted strings -- confirm against the implementation.
def format_array_from_datetime(
    values: npt.NDArray[np.int64],
    tz: tzinfo | None = ...,
    format: str | None = ...,
    na_rep: object = ...,
) -> npt.NDArray[np.object_]: ...
|
||||
# Stub: converts ``values`` interpreted in ``unit`` and returns a pair of
# (ndarray, tzinfo or None).
# NOTE(review): accepted values for ``errors`` are not visible here -- see
# the implementation.
def array_with_unit_to_datetime(
    values: np.ndarray,
    unit: str,
    errors: str = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...
|
||||
# Stub: takes an object-dtype ndarray plus parsing flags and returns a pair
# of (ndarray, tzinfo or None).
def array_to_datetime(
    values: npt.NDArray[np.object_],
    errors: str = ...,
    dayfirst: bool = ...,
    yearfirst: bool = ...,
    utc: bool = ...,
    require_iso8601: bool = ...,
    allow_mixed: bool = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user