first commit

2022-08-02 09:52:52 -04:00
parent 417ea8660b
commit 05e52aa52b
10444 changed files with 2300232 additions and 0 deletions
--- a/dashboard/flask-server/venv/Lib/site-packages/pandas/io/parquet.py
+++ b/dashboard/flask-server/venv/Lib/site-packages/pandas/io/parquet.py
@@ -0,0 +1,499 @@
+""" parquet compat """
+from __future__ import annotations
+
+import io
+import os
+from typing import Any
+from warnings import catch_warnings
+
+from pandas._typing import (
+    FilePath,
+    ReadBuffer,
+    StorageOptions,
+    WriteBuffer,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import doc
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    get_option,
+)
+from pandas.core.shared_docs import _shared_docs
+from pandas.util.version import Version
+
+from pandas.io.common import (
+    IOHandles,
+    get_handle,
+    is_fsspec_url,
+    is_url,
+    stringify_path,
+)
+
+
+def get_engine(engine: str) -> BaseImpl:
+    """return our implementation"""
+    if engine == "auto":
+        engine = get_option("io.parquet.engine")
+
+    if engine == "auto":
+        # try engines in this order
+        engine_classes = [PyArrowImpl, FastParquetImpl]
+
+        error_msgs = ""
+        for engine_class in engine_classes:
+            try:
+                return engine_class()
+            except ImportError as err:
+                error_msgs += "\n - " + str(err)
+
+        raise ImportError(
+            "Unable to find a usable engine; "
+            "tried using: 'pyarrow', 'fastparquet'.\n"
+            "A suitable version of "
+            "pyarrow or fastparquet is required for parquet "
+            "support.\n"
+            "Trying to import the above resulted in these errors:"
+            f"{error_msgs}"
+        )
+
+    if engine == "pyarrow":
+        return PyArrowImpl()
+    elif engine == "fastparquet":
+        return FastParquetImpl()
+
+    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
+
+
+def _get_path_or_handle(
+    path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
+    fs: Any,
+    storage_options: StorageOptions = None,
+    mode: str = "rb",
+    is_dir: bool = False,
+) -> tuple[
+    FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
+]:
+    """File handling for PyArrow."""
+    path_or_handle = stringify_path(path)
+    if is_fsspec_url(path_or_handle) and fs is None:
+        fsspec = import_optional_dependency("fsspec")
+
+        fs, path_or_handle = fsspec.core.url_to_fs(
+            path_or_handle, **(storage_options or {})
+        )
+    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
+        # can't write to a remote url
+        # without making use of fsspec at the moment
+        raise ValueError("storage_options passed with buffer, or non-supported URL")
+
+    handles = None
+    if (
+        not fs
+        and not is_dir
+        and isinstance(path_or_handle, str)
+        and not os.path.isdir(path_or_handle)
+    ):
+        # use get_handle only when we are very certain that it is not a directory
+        # fsspec resources can also point to directories
+        # this branch is used for example when reading from non-fsspec URLs
+        handles = get_handle(
+            path_or_handle, mode, is_text=False, storage_options=storage_options
+        )
+        fs = None
+        path_or_handle = handles.handle
+    return path_or_handle, handles, fs
+
+
+class BaseImpl:
+    @staticmethod
+    def validate_dataframe(df: DataFrame) -> None:
+
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
+
+        # must have value column names for all index levels (strings only)
+        if isinstance(df.columns, MultiIndex):
+            if not all(
+                x.inferred_type in {"string", "empty"} for x in df.columns.levels
+            ):
+                raise ValueError(
+                    """
+                    parquet must have string column names for all values in
+                     each level of the MultiIndex
+                    """
+                )
+        else:
+            if df.columns.inferred_type not in {"string", "empty"}:
+                raise ValueError("parquet must have string column names")
+
+        # index level names must be strings
+        valid_names = all(
+            isinstance(name, str) for name in df.index.names if name is not None
+        )
+        if not valid_names:
+            raise ValueError("Index level names must be strings")
+
+    def write(self, df: DataFrame, path, compression, **kwargs):
+        raise AbstractMethodError(self)
+
+    def read(self, path, columns=None, **kwargs) -> DataFrame:
+        raise AbstractMethodError(self)
+
+
+class PyArrowImpl(BaseImpl):
+    def __init__(self):
+        import_optional_dependency(
+            "pyarrow", extra="pyarrow is required for parquet support."
+        )
+        import pyarrow.parquet
+
+        # import utils to register the pyarrow extension types
+        import pandas.core.arrays._arrow_utils  # noqa:F401
+
+        self.api = pyarrow
+
+    def write(
+        self,
+        df: DataFrame,
+        path: FilePath | WriteBuffer[bytes],
+        compression: str | None = "snappy",
+        index: bool | None = None,
+        storage_options: StorageOptions = None,
+        partition_cols: list[str] | None = None,
+        **kwargs,
+    ) -> None:
+        self.validate_dataframe(df)
+
+        from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
+        if index is not None:
+            from_pandas_kwargs["preserve_index"] = index
+
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+
+        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+            path,
+            kwargs.pop("filesystem", None),
+            storage_options=storage_options,
+            mode="wb",
+            is_dir=partition_cols is not None,
+        )
+        try:
+            if partition_cols is not None:
+                # writes to multiple files under the given path
+                self.api.parquet.write_to_dataset(
+                    table,
+                    path_or_handle,
+                    compression=compression,
+                    partition_cols=partition_cols,
+                    **kwargs,
+                )
+            else:
+                # write to single output file
+                self.api.parquet.write_table(
+                    table, path_or_handle, compression=compression, **kwargs
+                )
+        finally:
+            if handles is not None:
+                handles.close()
+
+    def read(
+        self,
+        path,
+        columns=None,
+        use_nullable_dtypes=False,
+        storage_options: StorageOptions = None,
+        **kwargs,
+    ) -> DataFrame:
+        kwargs["use_pandas_metadata"] = True
+
+        to_pandas_kwargs = {}
+        if use_nullable_dtypes:
+            import pandas as pd
+
+            mapping = {
+                self.api.int8(): pd.Int8Dtype(),
+                self.api.int16(): pd.Int16Dtype(),
+                self.api.int32(): pd.Int32Dtype(),
+                self.api.int64(): pd.Int64Dtype(),
+                self.api.uint8(): pd.UInt8Dtype(),
+                self.api.uint16(): pd.UInt16Dtype(),
+                self.api.uint32(): pd.UInt32Dtype(),
+                self.api.uint64(): pd.UInt64Dtype(),
+                self.api.bool_(): pd.BooleanDtype(),
+                self.api.string(): pd.StringDtype(),
+            }
+            to_pandas_kwargs["types_mapper"] = mapping.get
+        manager = get_option("mode.data_manager")
+        if manager == "array":
+            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]
+
+        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+            path,
+            kwargs.pop("filesystem", None),
+            storage_options=storage_options,
+            mode="rb",
+        )
+        try:
+            result = self.api.parquet.read_table(
+                path_or_handle, columns=columns, **kwargs
+            ).to_pandas(**to_pandas_kwargs)
+            if manager == "array":
+                result = result._as_manager("array", copy=False)
+            return result
+        finally:
+            if handles is not None:
+                handles.close()
+
+
+class FastParquetImpl(BaseImpl):
+    def __init__(self):
+        # since pandas is a dependency of fastparquet
+        # we need to import on first use
+        fastparquet = import_optional_dependency(
+            "fastparquet", extra="fastparquet is required for parquet support."
+        )
+        self.api = fastparquet
+
+    def write(
+        self,
+        df: DataFrame,
+        path,
+        compression="snappy",
+        index=None,
+        partition_cols=None,
+        storage_options: StorageOptions = None,
+        **kwargs,
+    ) -> None:
+        self.validate_dataframe(df)
+        # thriftpy/protocol/compact.py:339:
+        # DeprecationWarning: tostring() is deprecated.
+        # Use tobytes() instead.
+
+        if "partition_on" in kwargs and partition_cols is not None:
+            raise ValueError(
+                "Cannot use both partition_on and "
+                "partition_cols. Use partition_cols for partitioning data"
+            )
+        elif "partition_on" in kwargs:
+            partition_cols = kwargs.pop("partition_on")
+
+        if partition_cols is not None:
+            kwargs["file_scheme"] = "hive"
+
+        # cannot use get_handle as write() does not accept file buffers
+        path = stringify_path(path)
+        if is_fsspec_url(path):
+            fsspec = import_optional_dependency("fsspec")
+
+            # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
+            kwargs["open_with"] = lambda path, _: fsspec.open(
+                path, "wb", **(storage_options or {})
+            ).open()
+        elif storage_options:
+            raise ValueError(
+                "storage_options passed with file object or non-fsspec file path"
+            )
+
+        with catch_warnings(record=True):
+            self.api.write(
+                path,
+                df,
+                compression=compression,
+                write_index=index,
+                partition_on=partition_cols,
+                **kwargs,
+            )
+
+    def read(
+        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+    ) -> DataFrame:
+        parquet_kwargs: dict[str, Any] = {}
+        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if Version(self.api.__version__) >= Version("0.7.1"):
+            # We are disabling nullable dtypes for fastparquet pending discussion
+            parquet_kwargs["pandas_nulls"] = False
+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
+        path = stringify_path(path)
+        handles = None
+        if is_fsspec_url(path):
+            fsspec = import_optional_dependency("fsspec")
+
+            if Version(self.api.__version__) > Version("0.6.1"):
+                parquet_kwargs["fs"] = fsspec.open(
+                    path, "rb", **(storage_options or {})
+                ).fs
+            else:
+                parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
+                    path, "rb", **(storage_options or {})
+                ).open()
+        elif isinstance(path, str) and not os.path.isdir(path):
+            # use get_handle only when we are very certain that it is not a directory
+            # fsspec resources can also point to directories
+            # this branch is used for example when reading from non-fsspec URLs
+            handles = get_handle(
+                path, "rb", is_text=False, storage_options=storage_options
+            )
+            path = handles.handle
+
+        parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
+
+        result = parquet_file.to_pandas(columns=columns, **kwargs)
+
+        if handles is not None:
+            handles.close()
+        return result
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def to_parquet(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes] | None = None,
+    engine: str = "auto",
+    compression: str | None = "snappy",
+    index: bool | None = None,
+    storage_options: StorageOptions = None,
+    partition_cols: list[str] | None = None,
+    **kwargs,
+) -> bytes | None:
+    """
+    Write a DataFrame to the parquet format.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str, path object, file-like object, or None, default None
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``write()`` function. If None, the result is
+        returned as bytes. If a string, it will be used as Root Directory path
+        when writing a partitioned dataset. The engine fastparquet does not
+        accept file-like objects.
+
+        .. versionchanged:: 1.2.0
+
+    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+        Parquet library to use. If 'auto', then the option
+        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+        behavior is to try 'pyarrow', falling back to 'fastparquet' if
+        'pyarrow' is unavailable.
+    compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
+        default 'snappy'. Name of the compression to use. Use ``None``
+        for no compression. The supported compression methods actually
+        depend on which engine is used. For 'pyarrow', 'snappy', 'gzip',
+        'brotli', 'lz4', 'zstd' are all supported. For 'fastparquet',
+        only 'gzip' and 'snappy' are supported.
+    index : bool, default None
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file.
+        If ``None``, similar to ``True`` the dataframe's index(es)
+        will be saved. However, instead of being saved as values,
+        the RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    partition_cols : str or list, optional, default None
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        Must be None if path is not a string.
+    {storage_options}
+
+        .. versionadded:: 1.2.0
+
+    kwargs
+        Additional keyword arguments passed to the engine
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+    """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
+    impl = get_engine(engine)
+
+    path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
+
+    impl.write(
+        df,
+        path_or_buf,
+        compression=compression,
+        index=index,
+        partition_cols=partition_cols,
+        storage_options=storage_options,
+        **kwargs,
+    )
+
+    if path is None:
+        assert isinstance(path_or_buf, io.BytesIO)
+        return path_or_buf.getvalue()
+    else:
+        return None
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+) -> DataFrame:
+    """
+    Load a parquet object from the file path, returning a DataFrame.
+
+    Parameters
+    ----------
+    path : str, path object or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``read()`` function.
+        The string could be a URL. Valid URL schemes include http, ftp, s3,
+        gs, and file. For file URLs, a host is expected. A local file could be:
+        ``file://localhost/path/to/table.parquet``.
+        A file URL can also be a path to a directory that contains multiple
+        partitioned parquet files. Both pyarrow and fastparquet support
+        paths to directories as well as file URLs. A directory path could be:
+        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
+    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+        Parquet library to use. If 'auto', then the option
+        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+        behavior is to try 'pyarrow', falling back to 'fastparquet' if
+        'pyarrow' is unavailable.
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+
+    {storage_options}
+
+        .. versionadded:: 1.3.0
+
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as missing value indicator
+        for the resulting DataFrame. (only applicable for the ``pyarrow``
+        engine)
+        As new dtypes are added that support ``pd.NA`` in the future, the
+        output with this option will change to use those dtypes.
+        Note: this is an experimental option, and behaviour (e.g. additional
+        support dtypes) may change without notice.
+
+        .. versionadded:: 1.2.0
+
+    **kwargs
+        Any additional kwargs are passed to the engine.
+
+    Returns
+    -------
+    DataFrame
+    """
+    impl = get_engine(engine)
+
+    return impl.read(
+        path,
+        columns=columns,
+        storage_options=storage_options,
+        use_nullable_dtypes=use_nullable_dtypes,
+        **kwargs,
+    )