polario.main

Main CLI interface

 1"""Main CLI interface
 2"""
 3import argparse
 4import json
 5import sys
 6from enum import Enum
 7from functools import reduce
 8from pathlib import Path
 9from pprint import pprint
10
11import polars as pl
12
13from polario import __version__
14
15
16class Command(Enum):
17    SHOW = "show"
18    SCHEMA = "schema"
19    JSON_HEAD = "json_head"
20    JSONL = "jsonl"
21    CONCAT_CSV = "concat_csv"
22    WRITE_CSV = "write_csv"
23
24
25def main() -> int:
26    """Main CLI interface"""
27    parser = argparse.ArgumentParser(
28        description="polario library commandline tool to inspect Parquet files"
29    )
30    parser.add_argument(
31        "--version",
32        action="version",
33        version="%(prog)s {version}".format(version=__version__),
34    )
35    parser.add_argument(
36        "cmd",
37        choices=[c.value for c in Command],
38        help="command to run",
39    )
40    parser.add_argument(
41        "paths",
42        metavar="PATH",
43        type=Path,
44        nargs="+",
45        help="input paths",
46    )
47    args = parser.parse_args()
48    cmd = Command(args.cmd)
49
50    if cmd == Command.CONCAT_CSV:
51        df = reduce(
52            lambda a, b: pl.concat([a, pl.read_csv(b, infer_schema_length=0)]),
53            args.paths[1:],
54            pl.read_csv(args.paths[0], infer_schema_length=0),
55        )
56        print(df)
57        output_filename = Path(args.paths[0].stem + ".parquet")
58        if output_filename.exists():
59            raise ValueError(f"Output file {output_filename} already exists")
60        print("Writing to", output_filename)
61        df.write_parquet(output_filename)
62        return 0
63    paths: list[Path] = args.paths
64    for path in paths:
65        if path.is_dir():
66            raise ValueError(
67                "Input path must be a file. File an issue if you want dataset support."
68            )
69        df = pl.read_parquet(path, use_pyarrow=True)
70        if cmd == Command.SHOW:
71            print(df)
72        elif cmd == Command.SCHEMA:
73            pprint(df.schema)
74        elif cmd == Command.JSON_HEAD:
75            json.dump(df.head().to_dicts(), sys.stdout, indent=2)
76        elif cmd == Command.JSONL:
77            for row in df.to_dicts():
78                json.dump(row, sys.stdout, separators=(",", ":"))
79                sys.stdout.write("\n")
80        elif cmd == Command.WRITE_CSV:
81            output_path = Path(path.name).with_suffix(".csv")
82            if output_path.exists():
83                print(f"Output file {output_path} already exists")
84                continue
85            df.write_csv(output_path)
86    return 0
87
88
89if __name__ == "__main__":
90    sys.exit(main())
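Taken together, the module parses one command name and one or more paths, then dispatches on the Command value. A minimal sketch of driving the entry point programmatically, assuming the package is importable; the file name data.parquet is a placeholder:

# Minimal sketch: invoke the CLI by setting sys.argv before calling main().
# "data.parquet" is a placeholder path, not something shipped with the library.
import sys

from polario.main import main

sys.argv = ["polario", "schema", "data.parquet"]
sys.exit(main())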
class Command(enum.Enum):

Enumeration of the subcommands supported by the CLI.

SHOW = <Command.SHOW: 'show'>
SCHEMA = <Command.SCHEMA: 'schema'>
JSON_HEAD = <Command.JSON_HEAD: 'json_head'>
JSONL = <Command.JSONL: 'jsonl'>
CONCAT_CSV = <Command.CONCAT_CSV: 'concat_csv'>
WRITE_CSV = <Command.WRITE_CSV: 'write_csv'>
Inherited members (from enum.Enum): name, value
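Because the positional cmd argument is restricted to the enum values, the string taken from the command line can be looked up by value without a fallback. A small sketch of that round trip:

from polario.main import Command

# argparse limits "cmd" to [c.value for c in Command], so this lookup cannot raise.
cmd = Command("jsonl")
assert cmd is Command.JSONL
assert cmd.value == "jsonl"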
def main() -> int:

Main CLI interface
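Both writing commands derive their output name from the input and write into the current working directory rather than next to the source file: concat_csv uses the first input's stem with a .parquet suffix and raises if that file already exists, while write_csv swaps the input's suffix for .csv and skips existing files. A small sketch of the naming rule, with placeholder paths:

from pathlib import Path

# concat_csv: first input's stem plus ".parquet", relative to the current directory.
assert Path(Path("reports/jan.csv").stem + ".parquet") == Path("jan.parquet")

# write_csv: the input's file name with its suffix replaced by ".csv".
assert Path(Path("data/part-0.parquet").name).with_suffix(".csv") == Path("part-0.csv")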