commit c19264f6b843c970f032d89a62f23e592bdc72f3
Author: g
Date:   Wed Apr 23 21:53:55 2025 +0300

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1771a2e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+tiles/
+geojson/
+geojson2024/
+nohup.out
+geojsons.tar.gz
\ No newline at end of file
diff --git a/extract.py b/extract.py
new file mode 100644
index 0000000..605bb6c
--- /dev/null
+++ b/extract.py
@@ -0,0 +1,56 @@
+"""Download hourly taxi ride-start tiles for each day of 2024 into ./tiles."""
+
+from datetime import date, timedelta
+from pathlib import Path
+
+from curl_cffi import requests
+
+# from tqdm import tqdm
+
+start_date = date(2024, 1, 1)
+end_date = date(2025, 1, 1)
+delta = end_date - start_date
+dates = [str(start_date + timedelta(days=i)) for i in range(delta.days)]
+
+z = 8
+xs = [153, 154, 155, 156]
+ys = [79, 80]
+hours = range(24)
+
+vars = [
+    {"z": z, "x": x, "y": y, "hour": hour, "date": day}
+    for day in dates
+    for hour in hours
+    for x in xs
+    for y in ys
+]
+print(len(vars))
+
+Path("./tiles").mkdir(parents=True, exist_ok=True)
+
+
+def write_tiles(v):
+    """Download the tile described by ``v`` and save it under ``./tiles``.
+
+    ``v`` is a dict with the keys ``z``, ``x``, ``y``, ``hour`` and ``date``.
+    A response with an HTTP error status is reported and skipped, so an
+    error body is never stored as if it were tile data.
+    """
+    day = v["date"]
+    hour = v["hour"]
+    z = v["z"]
+    x = v["x"]
+    y = v["y"]
+
+    url = f"https://prodvizhenie.mos.ru/tiles/v1/taxi/ride-start/{z}/{x}/{y}.pbf?hours={hour},{hour}&weekdays=1,2,3,4,5,6,7&dates={day},{day}&aggregation=h3"
+    r = requests.get(url, impersonate="chrome")
+    if r.status_code >= 400:
+        print(f"skipped {url}: HTTP {r.status_code}")
+        return
+    with open(Path(f"./tiles/{day}_{hour}_{z}_{x}_{y}.pbf"), "wb") as f:
+        f.write(r.content)
+
+
+# A plain loop: the calls are made for their side effects only.
+for v in vars:
+    write_tiles(v)
diff --git a/transform_concat.py b/transform_concat.py
new file mode 100644
index 0000000..1ce9caa
--- /dev/null
+++ b/transform_concat.py
@@ -0,0 +1,35 @@
+"""Merge the per-tile JSON files in ``geojson/`` into one parquet file."""
+
+import os
+
+import pandas as pd
+
+# from tqdm import tqdm
+
+# NOTE(review): this reads "geojson/" and writes "2023.parquet", while
+# transform_convert.py writes to "geojson2024/" - confirm the intended paths.
+jsons = os.listdir("geojson")
+
+
+def read_properties(json_filename):
+    """Load one file from ``geojson/`` and tag its rows with date and hour.
+
+    The date and hour are the first two ``_``-separated parts of the file
+    name. The ``address`` column is dropped when it is present.
+    """
+    df_json = pd.read_json(f"geojson/{json_filename}")
+    filename_parts = os.path.splitext(json_filename)[0].split("_")
+    df_json["date"] = filename_parts[0]
+    df_json["hour"] = filename_parts[1]
+    # errors="ignore": a file whose records carry no address must not abort
+    # the whole merge with a KeyError.
+    return df_json.drop(columns="address", errors="ignore")
+
+
+dfs = (read_properties(j) for j in jsons)
+df = pd.concat(dfs, ignore_index=True)
+print(df.shape)
+clean_df = df.drop_duplicates()
+print(clean_df.shape)
+clean_df.to_parquet("2023.parquet", index=False)
+print("finish")
diff --git a/transform_convert.py b/transform_convert.py
new file mode 100644
index 0000000..0b155b3
--- /dev/null
+++ b/transform_convert.py
@@ -0,0 +1,43 @@
+"""Extract feature properties from the 2024 tiles into ``geojson2024/``."""
+
+import json
+import os
+
+import mapbox_vector_tile
+from tqdm import tqdm
+
+tiles = os.listdir("tiles")
+tiles = [t for t in tiles if t[:4] == "2024"]
+
+# Nothing else creates the output directory; open() would fail without it.
+os.makedirs("geojson2024", exist_ok=True)
+
+
+def to_geojson(tile):
+    """Write the feature properties of ``tile`` to ``geojson2024/``.
+
+    ``tile`` is a file name inside ``tiles/``. No file is written when the
+    tile has no ``travel`` layer or that layer has no features.
+    """
+    tile_path = f"tiles/{tile}"
+    with open(tile_path, "rb") as f:
+        pbf = f.read()
+    decoded = mapbox_vector_tile.decode(pbf)
+    # Look the layer up defensively: a tile without it is skipped instead
+    # of raising KeyError.
+    features = decoded.get("travel", {}).get("features", [])
+    if not features:
+        return
+    features_properties = [feature["properties"] for feature in features]
+
+    filename = os.path.splitext(tile)[0]
+    # ensure_ascii=False emits non-ASCII text as-is, so pin the encoding
+    # rather than depend on the platform default.
+    with open(f"geojson2024/{filename}.geojson", "w", encoding="utf-8") as f:
+        json.dump(features_properties, f, ensure_ascii=False)
+
+
+# A plain loop: the calls are made for their side effects only.
+for t in tiles:
+    to_geojson(t)
+print("end")