first commit
This commit is contained in:
950
dashboard/flask-server/venv/Lib/site-packages/pandas/io/xml.py
Normal file
950
dashboard/flask-server/venv/Lib/site-packages/pandas/io/xml.py
Normal file
@@ -0,0 +1,950 @@
|
||||
"""
|
||||
:mod:`pandas.io.xml` is a module for reading XML.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import Sequence
|
||||
|
||||
from pandas._typing import (
|
||||
CompressionOptions,
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
StorageOptions,
|
||||
XMLParsers,
|
||||
)
|
||||
from pandas.compat._optional import import_optional_dependency
|
||||
from pandas.errors import (
|
||||
AbstractMethodError,
|
||||
ParserError,
|
||||
)
|
||||
from pandas.util._decorators import (
|
||||
deprecate_nonkeyword_arguments,
|
||||
doc,
|
||||
)
|
||||
|
||||
from pandas.core.dtypes.common import is_list_like
|
||||
|
||||
from pandas.core.frame import DataFrame
|
||||
from pandas.core.shared_docs import _shared_docs
|
||||
|
||||
from pandas.io.common import (
|
||||
file_exists,
|
||||
get_handle,
|
||||
is_fsspec_url,
|
||||
is_url,
|
||||
stringify_path,
|
||||
)
|
||||
from pandas.io.parsers import TextParser
|
||||
|
||||
|
||||
@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
|
||||
class _XMLFrameParser:
|
||||
"""
|
||||
Internal subclass to parse XML into DataFrames.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buffer : a valid JSON str, path object or file-like object
|
||||
Any valid string path is acceptable. The string could be a URL. Valid
|
||||
URL schemes include http, ftp, s3, and file.
|
||||
|
||||
xpath : str or regex
|
||||
The XPath expression to parse required set of nodes for
|
||||
migration to `Data Frame`. `etree` supports limited XPath.
|
||||
|
||||
namespacess : dict
|
||||
The namespaces defined in XML document (`xmlns:namespace='URI')
|
||||
as dicts with key being namespace and value the URI.
|
||||
|
||||
elems_only : bool
|
||||
Parse only the child elements at the specified `xpath`.
|
||||
|
||||
attrs_only : bool
|
||||
Parse only the attributes at the specified `xpath`.
|
||||
|
||||
names : list
|
||||
Column names for Data Frame of parsed XML data.
|
||||
|
||||
encoding : str
|
||||
Encoding of xml object or document.
|
||||
|
||||
stylesheet : str or file-like
|
||||
URL, file, file-like object, or a raw string containing XSLT,
|
||||
`etree` does not support XSLT but retained for consistency.
|
||||
|
||||
{decompression_options}
|
||||
|
||||
.. versionchanged:: 1.4.0 Zstandard support.
|
||||
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection,
|
||||
e.g. host, port, username, password, etc.,
|
||||
|
||||
See also
|
||||
--------
|
||||
pandas.io.xml._EtreeFrameParser
|
||||
pandas.io.xml._LxmlFrameParser
|
||||
|
||||
Notes
|
||||
-----
|
||||
To subclass this class effectively you must override the following methods:`
|
||||
* :func:`parse_data`
|
||||
* :func:`_parse_nodes`
|
||||
* :func:`_parse_doc`
|
||||
* :func:`_validate_names`
|
||||
* :func:`_validate_path`
|
||||
|
||||
|
||||
See each method's respective documentation for details on their
|
||||
functionality.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
|
||||
xpath: str,
|
||||
namespaces: dict[str, str] | None,
|
||||
elems_only: bool,
|
||||
attrs_only: bool,
|
||||
names: Sequence[str] | None,
|
||||
encoding: str | None,
|
||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
|
||||
compression: CompressionOptions,
|
||||
storage_options: StorageOptions,
|
||||
):
|
||||
self.path_or_buffer = path_or_buffer
|
||||
self.xpath = xpath
|
||||
self.namespaces = namespaces
|
||||
self.elems_only = elems_only
|
||||
self.attrs_only = attrs_only
|
||||
self.names = names
|
||||
self.encoding = encoding
|
||||
self.stylesheet = stylesheet
|
||||
self.is_style = None
|
||||
self.compression = compression
|
||||
self.storage_options = storage_options
|
||||
|
||||
def parse_data(self) -> list[dict[str, str | None]]:
|
||||
"""
|
||||
Parse xml data.
|
||||
|
||||
This method will call the other internal methods to
|
||||
validate xpath, names, parse and return specific nodes.
|
||||
"""
|
||||
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def _parse_nodes(self) -> list[dict[str, str | None]]:
|
||||
"""
|
||||
Parse xml nodes.
|
||||
|
||||
This method will parse the children and attributes of elements
|
||||
in xpath, conditionally for only elements, only attributes
|
||||
or both while optionally renaming node names.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* If only elements and only attributes are specified.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Namespace URIs will be removed from return node values.Also,
|
||||
elements with missing children or attributes compared to siblings
|
||||
will have optional keys filled withi None values.
|
||||
"""
|
||||
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def _validate_path(self) -> None:
|
||||
"""
|
||||
Validate xpath.
|
||||
|
||||
This method checks for syntax, evaluation, or empty nodes return.
|
||||
|
||||
Raises
|
||||
------
|
||||
SyntaxError
|
||||
* If xpah is not supported or issues with namespaces.
|
||||
|
||||
ValueError
|
||||
* If xpah does not return any nodes.
|
||||
"""
|
||||
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def _validate_names(self) -> None:
|
||||
"""
|
||||
Validate names.
|
||||
|
||||
This method will check if names is a list-like and aligns
|
||||
with length of parse nodes.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* If value is not a list and less then length of nodes.
|
||||
"""
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
def _parse_doc(self, raw_doc) -> bytes:
|
||||
"""
|
||||
Build tree from path_or_buffer.
|
||||
|
||||
This method will parse XML object into tree
|
||||
either from string/bytes or file location.
|
||||
"""
|
||||
raise AbstractMethodError(self)
|
||||
|
||||
|
||||
class _EtreeFrameParser(_XMLFrameParser):
|
||||
"""
|
||||
Internal class to parse XML into DataFrames with the Python
|
||||
standard library XML module: `xml.etree.ElementTree`.
|
||||
"""
|
||||
|
||||
def parse_data(self) -> list[dict[str, str | None]]:
|
||||
from xml.etree.ElementTree import XML
|
||||
|
||||
if self.stylesheet is not None:
|
||||
raise ValueError(
|
||||
"To use stylesheet, you need lxml installed and selected as parser."
|
||||
)
|
||||
|
||||
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
|
||||
|
||||
self._validate_path()
|
||||
self._validate_names()
|
||||
|
||||
return self._parse_nodes()
|
||||
|
||||
def _parse_nodes(self) -> list[dict[str, str | None]]:
|
||||
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
|
||||
dicts: list[dict[str, str | None]]
|
||||
|
||||
if self.elems_only and self.attrs_only:
|
||||
raise ValueError("Either element or attributes can be parsed not both.")
|
||||
elif self.elems_only:
|
||||
if self.names:
|
||||
dicts = [
|
||||
{
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
nm: ch.text.strip() if ch.text else None
|
||||
for nm, ch in zip(self.names, el.findall("*"))
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
else:
|
||||
dicts = [
|
||||
{
|
||||
ch.tag: ch.text.strip() if ch.text else None
|
||||
for ch in el.findall("*")
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
elif self.attrs_only:
|
||||
dicts = [
|
||||
{k: v.strip() if v else None for k, v in el.attrib.items()}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
else:
|
||||
if self.names:
|
||||
dicts = [
|
||||
{
|
||||
**el.attrib,
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
nm: ch.text.strip() if ch.text else None
|
||||
for nm, ch in zip(self.names, el.findall("*"))
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
else:
|
||||
dicts = [
|
||||
{
|
||||
**el.attrib,
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
ch.tag: ch.text.strip() if ch.text else None
|
||||
for ch in el.findall("*")
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
dicts = [
|
||||
{k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
|
||||
]
|
||||
|
||||
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
|
||||
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
|
||||
|
||||
if self.names:
|
||||
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
|
||||
|
||||
return dicts
|
||||
|
||||
def _validate_path(self) -> None:
|
||||
"""
|
||||
Notes
|
||||
-----
|
||||
`etree` supports limited XPath. If user attempts a more complex
|
||||
expression syntax error will raise.
|
||||
"""
|
||||
|
||||
msg = (
|
||||
"xpath does not return any nodes. "
|
||||
"If document uses namespaces denoted with "
|
||||
"xmlns, be sure to define namespaces and "
|
||||
"use them in xpath."
|
||||
)
|
||||
try:
|
||||
elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
|
||||
if elems is None:
|
||||
raise ValueError(msg)
|
||||
|
||||
if elems is not None and elems.find("*") is None and elems.attrib is None:
|
||||
raise ValueError(msg)
|
||||
|
||||
except (KeyError, SyntaxError):
|
||||
raise SyntaxError(
|
||||
"You have used an incorrect or unsupported XPath "
|
||||
"expression for etree library or you used an "
|
||||
"undeclared namespace prefix."
|
||||
)
|
||||
|
||||
def _validate_names(self) -> None:
|
||||
if self.names:
|
||||
parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
|
||||
children = parent.findall("*") if parent else []
|
||||
|
||||
if is_list_like(self.names):
|
||||
if len(self.names) < len(children):
|
||||
raise ValueError(
|
||||
"names does not match length of child elements in xpath."
|
||||
)
|
||||
else:
|
||||
raise TypeError(
|
||||
f"{type(self.names).__name__} is not a valid type for names"
|
||||
)
|
||||
|
||||
def _parse_doc(self, raw_doc) -> bytes:
|
||||
from xml.etree.ElementTree import (
|
||||
XMLParser,
|
||||
parse,
|
||||
tostring,
|
||||
)
|
||||
|
||||
handle_data = get_data_from_filepath(
|
||||
filepath_or_buffer=raw_doc,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
)
|
||||
|
||||
with preprocess_data(handle_data) as xml_data:
|
||||
curr_parser = XMLParser(encoding=self.encoding)
|
||||
r = parse(xml_data, parser=curr_parser)
|
||||
|
||||
return tostring(r.getroot())
|
||||
|
||||
|
||||
class _LxmlFrameParser(_XMLFrameParser):
|
||||
"""
|
||||
Internal class to parse XML into DataFrames with third-party
|
||||
full-featured XML library, `lxml`, that supports
|
||||
XPath 1.0 and XSLT 1.0.
|
||||
"""
|
||||
|
||||
def parse_data(self) -> list[dict[str, str | None]]:
|
||||
"""
|
||||
Parse xml data.
|
||||
|
||||
This method will call the other internal methods to
|
||||
validate xpath, names, optionally parse and run XSLT,
|
||||
and parse original or transformed XML and return specific nodes.
|
||||
"""
|
||||
from lxml.etree import XML
|
||||
|
||||
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
|
||||
|
||||
if self.stylesheet is not None:
|
||||
self.xsl_doc = XML(self._parse_doc(self.stylesheet))
|
||||
self.xml_doc = XML(self._transform_doc())
|
||||
|
||||
self._validate_path()
|
||||
self._validate_names()
|
||||
|
||||
return self._parse_nodes()
|
||||
|
||||
def _parse_nodes(self) -> list[dict[str, str | None]]:
|
||||
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
|
||||
dicts: list[dict[str, str | None]]
|
||||
|
||||
if self.elems_only and self.attrs_only:
|
||||
raise ValueError("Either element or attributes can be parsed not both.")
|
||||
|
||||
elif self.elems_only:
|
||||
if self.names:
|
||||
dicts = [
|
||||
{
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
nm: ch.text.strip() if ch.text else None
|
||||
for nm, ch in zip(self.names, el.xpath("*"))
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
else:
|
||||
dicts = [
|
||||
{
|
||||
ch.tag: ch.text.strip() if ch.text else None
|
||||
for ch in el.xpath("*")
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
elif self.attrs_only:
|
||||
dicts = [el.attrib for el in elems]
|
||||
|
||||
else:
|
||||
if self.names:
|
||||
dicts = [
|
||||
{
|
||||
**el.attrib,
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
nm: ch.text.strip() if ch.text else None
|
||||
for nm, ch in zip(self.names, el.xpath("*"))
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
else:
|
||||
dicts = [
|
||||
{
|
||||
**el.attrib,
|
||||
**(
|
||||
{el.tag: el.text.strip()}
|
||||
if el.text and not el.text.isspace()
|
||||
else {}
|
||||
),
|
||||
**{
|
||||
ch.tag: ch.text.strip() if ch.text else None
|
||||
for ch in el.xpath("*")
|
||||
},
|
||||
}
|
||||
for el in elems
|
||||
]
|
||||
|
||||
if self.namespaces or "}" in list(dicts[0].keys())[0]:
|
||||
dicts = [
|
||||
{k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
|
||||
for d in dicts
|
||||
]
|
||||
|
||||
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
|
||||
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
|
||||
|
||||
if self.names:
|
||||
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
|
||||
|
||||
return dicts
|
||||
|
||||
def _validate_path(self) -> None:
|
||||
|
||||
msg = (
|
||||
"xpath does not return any nodes. "
|
||||
"Be sure row level nodes are in xpath. "
|
||||
"If document uses namespaces denoted with "
|
||||
"xmlns, be sure to define namespaces and "
|
||||
"use them in xpath."
|
||||
)
|
||||
|
||||
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
|
||||
children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
|
||||
attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
|
||||
|
||||
if elems == []:
|
||||
raise ValueError(msg)
|
||||
|
||||
if elems != [] and attrs == [] and children == []:
|
||||
raise ValueError(msg)
|
||||
|
||||
def _validate_names(self) -> None:
|
||||
"""
|
||||
Validate names.
|
||||
|
||||
This method will check if names is a list and aligns with
|
||||
length of parse nodes.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
* If value is not a list and less then length of nodes.
|
||||
"""
|
||||
if self.names:
|
||||
children = self.xml_doc.xpath(
|
||||
self.xpath + "[1]/*", namespaces=self.namespaces
|
||||
)
|
||||
|
||||
if is_list_like(self.names):
|
||||
if len(self.names) < len(children):
|
||||
raise ValueError(
|
||||
"names does not match length of child elements in xpath."
|
||||
)
|
||||
else:
|
||||
raise TypeError(
|
||||
f"{type(self.names).__name__} is not a valid type for names"
|
||||
)
|
||||
|
||||
def _parse_doc(self, raw_doc) -> bytes:
|
||||
from lxml.etree import (
|
||||
XMLParser,
|
||||
fromstring,
|
||||
parse,
|
||||
tostring,
|
||||
)
|
||||
|
||||
handle_data = get_data_from_filepath(
|
||||
filepath_or_buffer=raw_doc,
|
||||
encoding=self.encoding,
|
||||
compression=self.compression,
|
||||
storage_options=self.storage_options,
|
||||
)
|
||||
|
||||
with preprocess_data(handle_data) as xml_data:
|
||||
curr_parser = XMLParser(encoding=self.encoding)
|
||||
|
||||
if isinstance(xml_data, io.StringIO):
|
||||
if self.encoding is None:
|
||||
raise TypeError(
|
||||
"Can not pass encoding None when input is StringIO."
|
||||
)
|
||||
|
||||
doc = fromstring(
|
||||
xml_data.getvalue().encode(self.encoding), parser=curr_parser
|
||||
)
|
||||
else:
|
||||
doc = parse(xml_data, parser=curr_parser)
|
||||
|
||||
return tostring(doc)
|
||||
|
||||
def _transform_doc(self) -> bytes:
|
||||
"""
|
||||
Transform original tree using stylesheet.
|
||||
|
||||
This method will transform original xml using XSLT script into
|
||||
am ideally flatter xml document for easier parsing and migration
|
||||
to Data Frame.
|
||||
"""
|
||||
from lxml.etree import XSLT
|
||||
|
||||
transformer = XSLT(self.xsl_doc)
|
||||
new_doc = transformer(self.xml_doc)
|
||||
|
||||
return bytes(new_doc)
|
||||
|
||||
|
||||
def get_data_from_filepath(
|
||||
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
|
||||
encoding: str | None,
|
||||
compression: CompressionOptions,
|
||||
storage_options: StorageOptions,
|
||||
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
|
||||
"""
|
||||
Extract raw XML data.
|
||||
|
||||
The method accepts three input types:
|
||||
1. filepath (string-like)
|
||||
2. file-like object (e.g. open file object, StringIO)
|
||||
3. XML string or bytes
|
||||
|
||||
This method turns (1) into (2) to simplify the rest of the processing.
|
||||
It returns input types (2) and (3) unchanged.
|
||||
"""
|
||||
if not isinstance(filepath_or_buffer, bytes):
|
||||
filepath_or_buffer = stringify_path(filepath_or_buffer)
|
||||
|
||||
if (
|
||||
isinstance(filepath_or_buffer, str)
|
||||
and not filepath_or_buffer.startswith(("<?xml", "<"))
|
||||
) and (
|
||||
not isinstance(filepath_or_buffer, str)
|
||||
or is_url(filepath_or_buffer)
|
||||
or is_fsspec_url(filepath_or_buffer)
|
||||
or file_exists(filepath_or_buffer)
|
||||
):
|
||||
with get_handle(
|
||||
filepath_or_buffer,
|
||||
"r",
|
||||
encoding=encoding,
|
||||
compression=compression,
|
||||
storage_options=storage_options,
|
||||
) as handle_obj:
|
||||
filepath_or_buffer = (
|
||||
# error: Incompatible types in assignment (expression has type
|
||||
# "Union[str, IO[str]]", variable has type "Union[Union[str,
|
||||
# PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
|
||||
handle_obj.handle.read() # type: ignore[assignment]
|
||||
if hasattr(handle_obj.handle, "read")
|
||||
else handle_obj.handle
|
||||
)
|
||||
|
||||
return filepath_or_buffer
|
||||
|
||||
|
||||
def preprocess_data(data) -> io.StringIO | io.BytesIO:
|
||||
"""
|
||||
Convert extracted raw data.
|
||||
|
||||
This method will return underlying data of extracted XML content.
|
||||
The data either has a `read` attribute (e.g. a file object or a
|
||||
StringIO/BytesIO) or is a string or bytes that is an XML document.
|
||||
"""
|
||||
|
||||
if isinstance(data, str):
|
||||
data = io.StringIO(data)
|
||||
|
||||
elif isinstance(data, bytes):
|
||||
data = io.BytesIO(data)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def _data_to_frame(data, **kwargs) -> DataFrame:
|
||||
"""
|
||||
Convert parsed data to Data Frame.
|
||||
|
||||
This method will bind xml dictionary data of keys and values
|
||||
into named columns of Data Frame using the built-in TextParser
|
||||
class that build Data Frame and infers specific dtypes.
|
||||
"""
|
||||
|
||||
tags = next(iter(data))
|
||||
nodes = [list(d.values()) for d in data]
|
||||
|
||||
try:
|
||||
with TextParser(nodes, names=tags, **kwargs) as tp:
|
||||
return tp.read()
|
||||
except ParserError:
|
||||
raise ParserError(
|
||||
"XML document may be too complex for import. "
|
||||
"Try to flatten document and use distinct "
|
||||
"element and attribute names."
|
||||
)
|
||||
|
||||
|
||||
def _parse(
|
||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
|
||||
xpath: str,
|
||||
namespaces: dict[str, str] | None,
|
||||
elems_only: bool,
|
||||
attrs_only: bool,
|
||||
names: Sequence[str] | None,
|
||||
encoding: str | None,
|
||||
parser: XMLParsers,
|
||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
|
||||
compression: CompressionOptions,
|
||||
storage_options: StorageOptions,
|
||||
**kwargs,
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Call internal parsers.
|
||||
|
||||
This method will conditionally call internal parsers:
|
||||
LxmlFrameParser and/or EtreeParser.
|
||||
|
||||
Raises
|
||||
------
|
||||
ImportError
|
||||
* If lxml is not installed if selected as parser.
|
||||
|
||||
ValueError
|
||||
* If parser is not lxml or etree.
|
||||
"""
|
||||
|
||||
p: _EtreeFrameParser | _LxmlFrameParser
|
||||
|
||||
if parser == "lxml":
|
||||
lxml = import_optional_dependency("lxml.etree", errors="ignore")
|
||||
|
||||
if lxml is not None:
|
||||
p = _LxmlFrameParser(
|
||||
path_or_buffer,
|
||||
xpath,
|
||||
namespaces,
|
||||
elems_only,
|
||||
attrs_only,
|
||||
names,
|
||||
encoding,
|
||||
stylesheet,
|
||||
compression,
|
||||
storage_options,
|
||||
)
|
||||
else:
|
||||
raise ImportError("lxml not found, please install or use the etree parser.")
|
||||
|
||||
elif parser == "etree":
|
||||
p = _EtreeFrameParser(
|
||||
path_or_buffer,
|
||||
xpath,
|
||||
namespaces,
|
||||
elems_only,
|
||||
attrs_only,
|
||||
names,
|
||||
encoding,
|
||||
stylesheet,
|
||||
compression,
|
||||
storage_options,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Values for parser can only be lxml or etree.")
|
||||
|
||||
data_dicts = p.parse_data()
|
||||
|
||||
return _data_to_frame(data=data_dicts, **kwargs)
|
||||
|
||||
|
||||
@deprecate_nonkeyword_arguments(
|
||||
version=None, allowed_args=["path_or_buffer"], stacklevel=2
|
||||
)
|
||||
@doc(
|
||||
storage_options=_shared_docs["storage_options"],
|
||||
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
|
||||
)
|
||||
def read_xml(
|
||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
|
||||
xpath: str = "./*",
|
||||
namespaces: dict[str, str] | None = None,
|
||||
elems_only: bool = False,
|
||||
attrs_only: bool = False,
|
||||
names: Sequence[str] | None = None,
|
||||
# encoding can not be None for lxml and StringIO input
|
||||
encoding: str | None = "utf-8",
|
||||
parser: XMLParsers = "lxml",
|
||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
|
||||
compression: CompressionOptions = "infer",
|
||||
storage_options: StorageOptions = None,
|
||||
) -> DataFrame:
|
||||
r"""
|
||||
Read XML document into a ``DataFrame`` object.
|
||||
|
||||
.. versionadded:: 1.3.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buffer : str, path object, or file-like object
|
||||
String, path object (implementing ``os.PathLike[str]``), or file-like
|
||||
object implementing a ``read()`` function. The string can be any valid XML
|
||||
string or a path. The string can further be a URL. Valid URL schemes
|
||||
include http, ftp, s3, and file.
|
||||
|
||||
xpath : str, optional, default './\*'
|
||||
The XPath to parse required set of nodes for migration to DataFrame.
|
||||
XPath should return a collection of elements and not a single
|
||||
element. Note: The ``etree`` parser supports limited XPath
|
||||
expressions. For more complex XPath, use ``lxml`` which requires
|
||||
installation.
|
||||
|
||||
namespaces : dict, optional
|
||||
The namespaces defined in XML document as dicts with key being
|
||||
namespace prefix and value the URI. There is no need to include all
|
||||
namespaces in XML, only the ones used in ``xpath`` expression.
|
||||
Note: if XML document uses default namespace denoted as
|
||||
`xmlns='<URI>'` without a prefix, you must assign any temporary
|
||||
namespace prefix such as 'doc' to the URI in order to parse
|
||||
underlying nodes and/or attributes. For example, ::
|
||||
|
||||
namespaces = {{"doc": "https://example.com"}}
|
||||
|
||||
elems_only : bool, optional, default False
|
||||
Parse only the child elements at the specified ``xpath``. By default,
|
||||
all child elements and non-empty text nodes are returned.
|
||||
|
||||
attrs_only : bool, optional, default False
|
||||
Parse only the attributes at the specified ``xpath``.
|
||||
By default, all attributes are returned.
|
||||
|
||||
names : list-like, optional
|
||||
Column names for DataFrame of parsed XML data. Use this parameter to
|
||||
rename original element names and distinguish same named elements.
|
||||
|
||||
encoding : str, optional, default 'utf-8'
|
||||
Encoding of XML document.
|
||||
|
||||
parser : {{'lxml','etree'}}, default 'lxml'
|
||||
Parser module to use for retrieval of data. Only 'lxml' and
|
||||
'etree' are supported. With 'lxml' more complex XPath searches
|
||||
and ability to use XSLT stylesheet are supported.
|
||||
|
||||
stylesheet : str, path object or file-like object
|
||||
A URL, file-like object, or a raw string containing an XSLT script.
|
||||
This stylesheet should flatten complex, deeply nested XML documents
|
||||
for easier parsing. To use this feature you must have ``lxml`` module
|
||||
installed and specify 'lxml' as ``parser``. The ``xpath`` must
|
||||
reference nodes of transformed XML document generated after XSLT
|
||||
transformation and not the original XML document. Only XSLT 1.0
|
||||
scripts and not later versions is currently supported.
|
||||
|
||||
{decompression_options}
|
||||
|
||||
.. versionchanged:: 1.4.0 Zstandard support.
|
||||
|
||||
{storage_options}
|
||||
|
||||
Returns
|
||||
-------
|
||||
df
|
||||
A DataFrame.
|
||||
|
||||
See Also
|
||||
--------
|
||||
read_json : Convert a JSON string to pandas object.
|
||||
read_html : Read HTML tables into a list of DataFrame objects.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This method is best designed to import shallow XML documents in
|
||||
following format which is the ideal fit for the two-dimensions of a
|
||||
``DataFrame`` (row by column). ::
|
||||
|
||||
<root>
|
||||
<row>
|
||||
<column1>data</column1>
|
||||
<column2>data</column2>
|
||||
<column3>data</column3>
|
||||
...
|
||||
</row>
|
||||
<row>
|
||||
...
|
||||
</row>
|
||||
...
|
||||
</root>
|
||||
|
||||
As a file format, XML documents can be designed any way including
|
||||
layout of elements and attributes as long as it conforms to W3C
|
||||
specifications. Therefore, this method is a convenience handler for
|
||||
a specific flatter design and not all possible XML structures.
|
||||
|
||||
However, for more complex XML documents, ``stylesheet`` allows you to
|
||||
temporarily redesign original document with XSLT (a special purpose
|
||||
language) for a flatter version for migration to a DataFrame.
|
||||
|
||||
This function will *always* return a single :class:`DataFrame` or raise
|
||||
exceptions due to issues with XML document, ``xpath``, or other
|
||||
parameters.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||||
... <data xmlns="http://example.com">
|
||||
... <row>
|
||||
... <shape>square</shape>
|
||||
... <degrees>360</degrees>
|
||||
... <sides>4.0</sides>
|
||||
... </row>
|
||||
... <row>
|
||||
... <shape>circle</shape>
|
||||
... <degrees>360</degrees>
|
||||
... <sides/>
|
||||
... </row>
|
||||
... <row>
|
||||
... <shape>triangle</shape>
|
||||
... <degrees>180</degrees>
|
||||
... <sides>3.0</sides>
|
||||
... </row>
|
||||
... </data>'''
|
||||
|
||||
>>> df = pd.read_xml(xml)
|
||||
>>> df
|
||||
shape degrees sides
|
||||
0 square 360 4.0
|
||||
1 circle 360 NaN
|
||||
2 triangle 180 3.0
|
||||
|
||||
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||||
... <data>
|
||||
... <row shape="square" degrees="360" sides="4.0"/>
|
||||
... <row shape="circle" degrees="360"/>
|
||||
... <row shape="triangle" degrees="180" sides="3.0"/>
|
||||
... </data>'''
|
||||
|
||||
>>> df = pd.read_xml(xml, xpath=".//row")
|
||||
>>> df
|
||||
shape degrees sides
|
||||
0 square 360 4.0
|
||||
1 circle 360 NaN
|
||||
2 triangle 180 3.0
|
||||
|
||||
>>> xml = '''<?xml version='1.0' encoding='utf-8'?>
|
||||
... <doc:data xmlns:doc="https://example.com">
|
||||
... <doc:row>
|
||||
... <doc:shape>square</doc:shape>
|
||||
... <doc:degrees>360</doc:degrees>
|
||||
... <doc:sides>4.0</doc:sides>
|
||||
... </doc:row>
|
||||
... <doc:row>
|
||||
... <doc:shape>circle</doc:shape>
|
||||
... <doc:degrees>360</doc:degrees>
|
||||
... <doc:sides/>
|
||||
... </doc:row>
|
||||
... <doc:row>
|
||||
... <doc:shape>triangle</doc:shape>
|
||||
... <doc:degrees>180</doc:degrees>
|
||||
... <doc:sides>3.0</doc:sides>
|
||||
... </doc:row>
|
||||
... </doc:data>'''
|
||||
|
||||
>>> df = pd.read_xml(xml,
|
||||
... xpath="//doc:row",
|
||||
... namespaces={{"doc": "https://example.com"}})
|
||||
>>> df
|
||||
shape degrees sides
|
||||
0 square 360 4.0
|
||||
1 circle 360 NaN
|
||||
2 triangle 180 3.0
|
||||
"""
|
||||
|
||||
return _parse(
|
||||
path_or_buffer=path_or_buffer,
|
||||
xpath=xpath,
|
||||
namespaces=namespaces,
|
||||
elems_only=elems_only,
|
||||
attrs_only=attrs_only,
|
||||
names=names,
|
||||
encoding=encoding,
|
||||
parser=parser,
|
||||
stylesheet=stylesheet,
|
||||
compression=compression,
|
||||
storage_options=storage_options,
|
||||
)
|
||||
Reference in New Issue
Block a user