Initial commit (clean, ignores in place)
This commit is contained in:
18
mileage_logger/ingest/__init__.py
Normal file
18
mileage_logger/ingest/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Subpackage for data ingestion.
|
||||
|
||||
The :mod:`mileage_logger.ingest` package contains utilities for reading
|
||||
Google Semantic Location History JSON exports. The core entry point is
|
||||
the :func:`load_place_visits` function which converts raw JSON into
|
||||
structured :class:`PlaceVisit` objects. These objects expose
|
||||
timezone-aware start and end timestamps as well as geographic
|
||||
coordinates and the human readable name of the location.
|
||||
"""
|
||||
|
||||
from .semantic_reader import Location, PlaceVisit, ActivitySegment, load_place_visits
|
||||
|
||||
# Public API of the package: the names re-exported from
# ``.semantic_reader`` by the import above.
__all__ = [
    "Location",
    "PlaceVisit",
    "ActivitySegment",
    "load_place_visits",
]
|
258
mileage_logger/ingest/semantic_reader.py
Normal file
258
mileage_logger/ingest/semantic_reader.py
Normal file
@@ -0,0 +1,258 @@
|
||||
"""Parser for Google Semantic Location History exports.
|
||||
|
||||
Legacy Google Takeout exports are JSON files with a top-level
``timelineObjects`` array in which each entry is either a ``placeVisit``
or an ``activitySegment``. Newer on-device Timeline exports instead
provide a top-level ``semanticSegments`` array. This module exposes
data classes representing those events and a convenient loader that
normalises the timestamp and coordinate formats of both layouts.
|
||||
|
||||
In the legacy layout timestamps are encoded as millisecond epoch
strings and coordinates as integer multiples of 1e-7 degrees; in the
on-device layout timestamps are ISO 8601 strings and coordinates are
stored in a ``latLng`` string. When loaded, timestamps are converted
into timezone-aware :class:`datetime.datetime` objects and coordinates
into floats.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import pytz
|
||||
|
||||
|
||||
@dataclass
class Location:
    """A named point on the map.

    Attributes
    ----------
    lat : float
        Latitude in decimal degrees.
    lon : float
        Longitude in decimal degrees.
    name : str
        Human readable name of the location as provided by Google; may
        be an empty string when the export carries no name.
    """

    lat: float
    lon: float
    name: str
|
||||
|
||||
|
||||
@dataclass
class PlaceVisit:
    """A period of time spent at a single location.

    Attributes
    ----------
    location : :class:`Location`
        The geographic coordinates and name of the place.
    start_time : :class:`datetime`
        The timezone-aware timestamp at which the visit began.
    end_time : :class:`datetime`
        The timezone-aware timestamp at which the visit ended.
    """

    location: Location
    start_time: datetime
    end_time: datetime
|
||||
|
||||
|
||||
@dataclass
class ActivitySegment:
    """A movement between two locations.

    While not used directly in itinerary detection, activity segments
    contain useful timing information that could be used to derive the
    start date for a hop between recognised sites. This class is
    provided for completeness and potential future use.

    Attributes
    ----------
    start_location : :class:`Location`
        Where the movement began.
    end_location : :class:`Location`
        Where the movement finished.
    start_time : :class:`datetime`
        Timestamp at which the movement began.
    end_time : :class:`datetime`
        Timestamp at which the movement finished.
    activity_type : str
        Label describing the kind of movement (presumably the export's
        ``activityType`` value -- confirm against callers).
    """

    start_location: Location
    end_location: Location
    start_time: datetime
    end_time: datetime
    activity_type: str
|
||||
|
||||
|
||||
def _ms_to_dt(ms: str, tz: pytz.BaseTzInfo) -> datetime:
    """Turn a millisecond epoch string into an aware datetime in *tz*.

    Parameters
    ----------
    ms : str
        Milliseconds since the Unix epoch encoded as a decimal string.
    tz : :class:`pytz.tzinfo.BaseTzInfo`
        The timezone in which the resulting datetime is expressed.

    Returns
    -------
    :class:`datetime`
        A timezone-aware datetime for the same instant, converted to
        ``tz``.
    """
    # The export counts milliseconds from the UTC epoch, so the instant is
    # first built in UTC and only then shifted into the requested zone.
    epoch_seconds = int(ms) / 1000.0
    return datetime.fromtimestamp(epoch_seconds, timezone.utc).astimezone(tz)
|
||||
|
||||
|
||||
def _parse_location(raw: dict) -> Location:
    """Build a :class:`Location` from an export location mapping.

    The export stores latitude and longitude as integer multiples of
    1e-7 degrees; both are scaled to decimal degrees here.

    Parameters
    ----------
    raw : dict
        A mapping that may contain ``latitudeE7``, ``longitudeE7`` and
        ``name`` keys.

    Returns
    -------
    :class:`Location`
        The parsed location. A missing coordinate becomes ``0.0`` and a
        missing name becomes the empty string.
    """
    e7_lat = raw.get("latitudeE7")
    e7_lon = raw.get("longitudeE7")
    label = raw.get("name", "")

    if e7_lat is None:
        latitude = 0.0
    else:
        latitude = float(e7_lat) / 1e7

    if e7_lon is None:
        longitude = 0.0
    else:
        longitude = float(e7_lon) / 1e7

    return Location(lat=latitude, lon=longitude, name=label)
|
||||
|
||||
|
||||
def _legacy_place_visits(timeline_objects: list, tz: pytz.BaseTzInfo) -> List[PlaceVisit]:
    """Extract place visits from a legacy ``timelineObjects`` array."""
    visits: List[PlaceVisit] = []
    for obj in timeline_objects:
        # Only ``placeVisit`` entries matter for itinerary detection.
        # ``activitySegment`` entries (and anything else) are skipped
        # without being parsed, so bad data in an ignored entry cannot
        # abort the load.
        pv = obj.get("placeVisit") if isinstance(obj, dict) else None
        if not isinstance(pv, dict):
            continue
        loc = _parse_location(pv.get("location") or {})
        dur = pv.get("duration") or {}
        start_ms = dur.get("startTimestampMs")
        end_ms = dur.get("endTimestampMs")
        if start_ms is None or end_ms is None:
            # Skip malformed entries
            continue
        try:
            start_dt = _ms_to_dt(start_ms, tz)
            end_dt = _ms_to_dt(end_ms, tz)
        except (TypeError, ValueError, OverflowError, OSError):
            # Unparseable or out-of-range timestamps are skipped, matching
            # how the device-local format treats unparseable times.
            continue
        visits.append(PlaceVisit(location=loc, start_time=start_dt, end_time=end_dt))
    return visits


def _segment_place_visits(segments: list, tz: pytz.BaseTzInfo) -> List[PlaceVisit]:
    """Extract place visits from a device-local ``semanticSegments`` array."""
    try:
        from dateutil import parser as dateutil_parser  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "python-dateutil is required to parse device-local Timeline exports. "
            "Install it with 'pip install python-dateutil'."
        ) from exc

    visits: List[PlaceVisit] = []
    for segment in segments:
        if not isinstance(segment, dict):
            continue
        # Only interested in visit segments; skip activities and path-only entries
        visit = segment.get("visit")
        if not visit or not isinstance(visit, dict):
            continue

        # Start and end times are ISO 8601 strings with timezone offsets
        start_time_iso = segment.get("startTime")
        end_time_iso = segment.get("endTime")
        if not start_time_iso or not end_time_iso:
            continue
        try:
            start_dt = dateutil_parser.isoparse(start_time_iso).astimezone(tz)
            end_dt = dateutil_parser.isoparse(end_time_iso).astimezone(tz)
        except (TypeError, ValueError, OverflowError):
            # Skip unparseable times
            continue

        # Coordinates are stored as "latLng": "lat°, lon°"
        candidate = visit.get("topCandidate") or {}
        place_loc = candidate.get("placeLocation") or {}
        latlng_str = place_loc.get("latLng")
        if not latlng_str:
            continue
        try:
            # Strip the degree symbol and split into lat/lon components
            lat_str, lon_str = [
                part.strip().replace("°", "") for part in latlng_str.split(",")
            ]
            lat = float(lat_str)
            lon = float(lon_str)
        except (AttributeError, TypeError, ValueError):
            # Not a string, not exactly two components, or not numeric
            continue

        # Use the label or, failing that, the semantic type as the name
        name = candidate.get("label") or candidate.get("semanticType") or ""
        visits.append(PlaceVisit(
            location=Location(lat=lat, lon=lon, name=str(name)),
            start_time=start_dt,
            end_time=end_dt,
        ))
    return visits


def load_place_visits(path: str, tz_name: str = "Europe/London") -> List[PlaceVisit]:
    """Load all place visits from a Location History JSON file.

    This function supports both the legacy "Semantic Location History"
    exports (containing a top-level ``timelineObjects`` array) and
    newer on-device Timeline exports that expose a ``semanticSegments``
    array. In both cases the goal is to extract "place visits" –
    periods of time spent at a single location. When both keys are
    present the legacy array takes precedence.

    For legacy files the timestamps are millisecond epoch strings and
    coordinates are encoded as integer multiples of 1e-7 degrees. For
    device-local exports the timestamps are ISO 8601 strings with
    timezone offsets and coordinates are stored in a ``latLng`` string
    on the ``visit.topCandidate.placeLocation``.

    Entries with missing or unparseable timestamps, and device-local
    entries with missing or unparseable coordinates, are skipped.

    Parameters
    ----------
    path : str
        Path to the JSON file produced by Google Takeout or the
        on-device Timeline export.
    tz_name : str, optional
        The name of the timezone used for localisation, by default
        ``Europe/London``. See the ``pytz`` documentation for valid
        identifiers.

    Returns
    -------
    list of :class:`PlaceVisit`
        A chronologically ordered list of place visits. The list is
        empty when the file contains neither known top-level array.

    Raises
    ------
    ImportError
        If the file is a device-local export and ``python-dateutil`` is
        not installed.

    Errors from opening the file, decoding the JSON, or looking up
    ``tz_name`` with ``pytz.timezone`` are not caught here.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    tz = pytz.timezone(tz_name)

    if not isinstance(data, dict):
        # Neither known layout can be present without a top-level object.
        return []

    timeline_objects = data.get("timelineObjects")
    semantic_segments = data.get("semanticSegments")
    if isinstance(timeline_objects, list):
        # Legacy Semantic Location History format
        visits = _legacy_place_visits(timeline_objects, tz)
    elif isinstance(semantic_segments, list):
        # New device-local Timeline export format
        visits = _segment_place_visits(semantic_segments, tz)
    else:
        # Ignore any other structures (e.g. rawSignals, userLocationProfile)
        return []

    # Sort visits chronologically by start time
    visits.sort(key=lambda v: v.start_time)
    return visits
|
Reference in New Issue
Block a user