Initial commit (clean, ignores in place)

This commit is contained in:
2025-08-12 01:13:41 +01:00
commit c74790b014
26 changed files with 2331 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
"""Subpackage for data ingestion.
The :mod:`mileage_logger.ingest` package contains utilities for reading
Google Semantic Location History JSON exports. The core entry point is
the :func:`load_place_visits` function which converts raw JSON into
structured :class:`PlaceVisit` objects. These objects expose
timezone-aware start and end timestamps as well as geographic
coordinates and the human readable name of the location.
"""
from .semantic_reader import Location, PlaceVisit, ActivitySegment, load_place_visits
# Public API of the ingest subpackage: the names imported above from
# ``.semantic_reader`` and re-exported for ``from ... import *``.
__all__ = [
"Location",
"PlaceVisit",
"ActivitySegment",
"load_place_visits",
]

View File

@@ -0,0 +1,258 @@
"""Parser for Google Semantic Location History exports.
Google Takeout and on-device exports of the Timeline API are provided
as JSON files under a ``timelineObjects`` key. Each entry in
``timelineObjects`` is either a ``placeVisit`` or an ``activitySegment``.
This module exposes data classes representing those events and a
convenient loader that normalises timestamps and coordinate formats.
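Timestamps in legacy exports are encoded as millisecond epoch
strings, while device-local exports use ISO 8601 strings with
timezone offsets. When loaded, both are converted into timezone-aware
:class:`datetime.datetime` objects. Coordinates in legacy exports are
stored as integer multiples of 1e-7 degrees, which we scale to floats;
device-local exports store them in a ``latLng`` string.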
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable, List, Optional
import pytz
@dataclass
class Location:
    """Named point on the Earth's surface.

    The generated constructor accepts the fields positionally in the
    order ``lat``, ``lon``, ``name``.
    """

    # Latitude, expressed in decimal degrees.
    lat: float
    # Longitude, expressed in decimal degrees.
    lon: float
    # Human readable place name as provided by Google.
    name: str
@dataclass
class PlaceVisit:
    """Stay at one location between a start and an end instant.

    The generated constructor accepts the fields positionally in the
    order ``location``, ``start_time``, ``end_time``.
    """

    # Coordinates and name of the visited place.
    location: Location
    # Timezone-aware instant at which the visit began.
    start_time: datetime
    # Timezone-aware instant at which the visit ended.
    end_time: datetime
@dataclass
class ActivitySegment:
    """Journey from one location to another.

    Itinerary detection does not consume these directly. The timing
    they carry could help pin down when a hop between recognised sites
    began, so the type is kept for completeness and possible later use.
    """

    # Where the movement began.
    start_location: Location
    # Where the movement finished.
    end_location: Location
    # Instant at which the movement began.
    start_time: datetime
    # Instant at which the movement finished.
    end_time: datetime
    # Activity label taken from the export.
    activity_type: str
def _ms_to_dt(ms: str, tz: pytz.BaseTzInfo) -> datetime:
"""Convert a millisecond epoch string into a timezone-aware datetime.
Parameters
----------
ms : str
Milliseconds since the Unix epoch encoded as a decimal string.
tz : :class:`pytz.tzinfo.BaseTzInfo`
The timezone into which to localise the resulting datetime.
Returns
-------
:class:`datetime`
A timezone-aware datetime corresponding to the input.
"""
# Google exports store times in milliseconds since UTC epoch
ts = int(ms) / 1000.0
utc_dt = datetime.fromtimestamp(ts, timezone.utc)
return utc_dt.astimezone(tz)
def _parse_location(raw: dict) -> Location:
    """Build a :class:`Location` from an export location mapping.

    Latitude and longitude are read from the ``latitudeE7`` and
    ``longitudeE7`` keys, which hold degrees multiplied by 1e7, and are
    scaled back to decimal degrees. The label comes from ``name``.

    Parameters
    ----------
    raw : dict
        Mapping that may contain ``latitudeE7``, ``longitudeE7`` and
        ``name`` keys.

    Returns
    -------
    :class:`Location`
        The parsed location. A coordinate that is absent (or ``None``)
        becomes ``0.0``; an absent name becomes the empty string.
    """

    def _degrees(key: str) -> float:
        # Scale an E7-encoded coordinate, defaulting to 0.0 when missing.
        encoded = raw.get(key)
        if encoded is None:
            return 0.0
        return float(encoded) / 1e7

    return Location(
        lat=_degrees("latitudeE7"),
        lon=_degrees("longitudeE7"),
        name=raw.get("name", ""),
    )
def load_place_visits(path: str, tz_name: str = "Europe/London") -> List[PlaceVisit]:
    """Load all place visits from a Location History JSON file.

    This function supports both the legacy "Semantic Location History"
    exports (containing a top-level ``timelineObjects`` array) and
    newer on-device Timeline exports that expose a ``semanticSegments``
    array. In both cases the goal is to extract "place visits":
    periods of time spent at a single location.

    For legacy files the timestamps are millisecond epoch strings and
    coordinates are encoded as integer multiples of 1e-7 degrees. For
    device-local exports the timestamps are ISO 8601 strings with
    timezone offsets and coordinates are stored in a ``latLng`` string
    on the ``visit.topCandidate.placeLocation``.

    Entries that are malformed (missing or unparseable timestamps,
    missing or unparseable coordinates in the device-local format) are
    skipped rather than aborting the whole load.

    Parameters
    ----------
    path : str
        Path to the JSON file produced by Google Takeout or the
        on-device Timeline export.
    tz_name : str, optional
        The name of the timezone used for localisation, by default
        ``Europe/London``. See the ``pytz`` documentation for valid
        identifiers.

    Returns
    -------
    list of :class:`PlaceVisit`
        A chronologically ordered list of place visits. The list is
        empty when the file contains neither known top-level key.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    ImportError
        If a device-local export is loaded and ``python-dateutil`` is
        not installed.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    tz = pytz.timezone(tz_name)

    # Both supported formats are JSON objects; anything else (a bare
    # list, string, number...) holds no recognisable visits.
    if not isinstance(data, dict):
        return []

    timeline_objects = data.get("timelineObjects")
    semantic_segments = data.get("semanticSegments")
    if isinstance(timeline_objects, list):
        # Legacy Semantic Location History format
        visits = _visits_from_timeline_objects(timeline_objects, tz)
    elif isinstance(semantic_segments, list):
        # New device-local Timeline export format
        visits = _visits_from_semantic_segments(semantic_segments, tz)
    else:
        # If the file doesn't contain known keys, return empty list
        return []

    # Sort visits chronologically by start time
    visits.sort(key=lambda v: v.start_time)
    return visits


def _visits_from_timeline_objects(timeline_objects: list, tz: pytz.BaseTzInfo) -> List[PlaceVisit]:
    """Extract place visits from a legacy ``timelineObjects`` array."""
    visits: List[PlaceVisit] = []
    for obj in timeline_objects:
        if not isinstance(obj, dict):
            continue
        pv = obj.get("placeVisit")
        if not isinstance(pv, dict):
            # Anything else (e.g. an ``activitySegment``) is not a place
            # visit; itinerary detection only relies on place visits, so
            # such entries are skipped without being parsed.
            continue
        # ``or {}`` also covers an explicit JSON ``null`` value.
        dur = pv.get("duration") or {}
        start_ms = dur.get("startTimestampMs")
        end_ms = dur.get("endTimestampMs")
        if start_ms is None or end_ms is None:
            # Skip malformed entries
            continue
        try:
            start_dt = _ms_to_dt(start_ms, tz)
            end_dt = _ms_to_dt(end_ms, tz)
        except (TypeError, ValueError, OverflowError, OSError):
            # Skip unparseable times, as the device-local branch does
            continue
        visits.append(PlaceVisit(
            location=_parse_location(pv.get("location") or {}),
            start_time=start_dt,
            end_time=end_dt,
        ))
    return visits


def _visits_from_semantic_segments(segments: list, tz: pytz.BaseTzInfo) -> List[PlaceVisit]:
    """Extract place visits from a device-local ``semanticSegments`` array."""
    # Imported lazily so python-dateutil is only required for this format.
    try:
        from dateutil import parser as dateutil_parser  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "python-dateutil is required to parse device-local Timeline exports. "
            "Install it with 'pip install python-dateutil'."
        ) from exc

    visits: List[PlaceVisit] = []
    for segment in segments:
        if not isinstance(segment, dict):
            continue
        # Only interested in visit segments; skip activities and path-only entries
        visit = segment.get("visit")
        if not visit or not isinstance(visit, dict):
            continue
        # Extract start and end times (ISO 8601 with timezone offsets)
        start_time_iso = segment.get("startTime")
        end_time_iso = segment.get("endTime")
        if not start_time_iso or not end_time_iso:
            continue
        try:
            start_dt = dateutil_parser.isoparse(start_time_iso).astimezone(tz)
            end_dt = dateutil_parser.isoparse(end_time_iso).astimezone(tz)
        except (TypeError, ValueError, OverflowError):
            # Skip unparseable times
            continue
        # Extract coordinates; stored as "latLng": "lat°, lon°"
        candidate = visit.get("topCandidate") or {}
        place_loc = candidate.get("placeLocation") or {}
        latlng_str = place_loc.get("latLng")
        if not latlng_str:
            continue
        # Strip degree symbol and split into lat/lon components
        try:
            lat_str, lon_str = (
                part.strip().replace("°", "") for part in latlng_str.split(",")
            )
            lat = float(lat_str)
            lon = float(lon_str)
        except (AttributeError, TypeError, ValueError):
            # Not a string, wrong number of components, or non-numeric
            continue
        # Use the label or semantic type as the name if available
        name = candidate.get("label") or candidate.get("semanticType") or ""
        visits.append(PlaceVisit(
            location=Location(lat=lat, lon=lon, name=str(name)),
            start_time=start_dt,
            end_time=end_dt,
        ))
    return visits