Files
Mileage-Logger/mileage_logger/ingest/semantic_reader.py

258 lines
9.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Parser for Google Semantic Location History exports.
Google Takeout and on-device exports of the Timeline API are provided
as JSON files under a ``timelineObjects`` key. Each entry in
``timelineObjects`` is either a ``placeVisit`` or an ``activitySegment``.
This module exposes data classes representing those events and a
convenient loader that normalises timestamps and coordinate formats.
Timestamps in the source JSON are encoded as millisecond epoch
strings. When loaded these are converted into timezone-aware
:class:`datetime.datetime` objects. Coordinates in the JSON are stored
as integer multiples of 1e-7 degrees; we scale them to floats.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable, List, Optional
import pytz
@dataclass
class Location:
    """A named point on the map.

    Attributes
    ----------
    lat : float
        Latitude, expressed in decimal degrees.
    lon : float
        Longitude, expressed in decimal degrees.
    name : str
        Human-readable label for the place, as supplied by Google.
    """

    lat: float
    lon: float
    name: str
@dataclass
class PlaceVisit:
    """A stay at one place over a span of time.

    Attributes
    ----------
    location : :class:`Location`
        Coordinates and name of the visited place.
    start_time : :class:`datetime`
        Timezone-aware moment at which the visit began.
    end_time : :class:`datetime`
        Timezone-aware moment at which the visit ended.
    """

    location: Location
    start_time: datetime
    end_time: datetime
@dataclass
class ActivitySegment:
    """A journey from one location to another.

    Itinerary detection does not consume activity segments directly.
    Their timing information could, however, be used to derive the start
    date of a hop between recognised sites, so the class is kept for
    completeness and possible future use.

    Attributes
    ----------
    start_location : :class:`Location`
        Where the movement began.
    end_location : :class:`Location`
        Where the movement finished.
    start_time : :class:`datetime`
        Moment at which the movement began.
    end_time : :class:`datetime`
        Moment at which the movement finished.
    activity_type : str
        Label describing the kind of activity.
    """

    start_location: Location
    end_location: Location
    start_time: datetime
    end_time: datetime
    activity_type: str
def _ms_to_dt(ms: str, tz: pytz.BaseTzInfo) -> datetime:
"""Convert a millisecond epoch string into a timezone-aware datetime.
Parameters
----------
ms : str
Milliseconds since the Unix epoch encoded as a decimal string.
tz : :class:`pytz.tzinfo.BaseTzInfo`
The timezone into which to localise the resulting datetime.
Returns
-------
:class:`datetime`
A timezone-aware datetime corresponding to the input.
"""
# Google exports store times in milliseconds since UTC epoch
ts = int(ms) / 1000.0
utc_dt = datetime.fromtimestamp(ts, timezone.utc)
return utc_dt.astimezone(tz)
def _parse_location(raw: dict) -> Location:
    """Build a :class:`Location` from an export location mapping.

    Latitude and longitude are stored in the export as integer multiples
    of 1e-7 degrees; they are scaled down to decimal degrees here. The
    ``name`` entry is copied across as-is.

    Parameters
    ----------
    raw : dict
        Mapping that may carry ``latitudeE7``, ``longitudeE7`` and
        ``name`` keys.

    Returns
    -------
    :class:`Location`
        The populated location. A missing coordinate becomes ``0.0`` and
        a missing name becomes the empty string.
    """

    def _degrees(key: str) -> float:
        # Absent (or null) E7 values fall back to 0.0 rather than failing.
        e7_value = raw.get(key)
        if e7_value is None:
            return 0.0
        return float(e7_value) / 1e7

    return Location(
        lat=_degrees("latitudeE7"),
        lon=_degrees("longitudeE7"),
        name=raw.get("name", ""),
    )
def _visits_from_timeline_objects(
    timeline_objects: list, tz: pytz.BaseTzInfo
) -> List[PlaceVisit]:
    """Extract place visits from a legacy ``timelineObjects`` array.

    Entries that are not ``placeVisit`` objects (for example
    ``activitySegment`` entries) are ignored, as are visits whose
    timestamps are missing or cannot be converted.
    """
    visits: List[PlaceVisit] = []
    for obj in timeline_objects:
        if not isinstance(obj, dict):
            continue
        # Activity segments are intentionally ignored: itinerary detection
        # relies on place visits only, so nothing is built for them.
        pv = obj.get("placeVisit")
        if not isinstance(pv, dict):
            continue
        duration = pv.get("duration")
        if not isinstance(duration, dict):
            continue
        start_ms = duration.get("startTimestampMs")
        end_ms = duration.get("endTimestampMs")
        if start_ms is None or end_ms is None:
            # Skip malformed entries
            continue
        try:
            start_dt = _ms_to_dt(start_ms, tz)
            end_dt = _ms_to_dt(end_ms, tz)
        except (TypeError, ValueError, OverflowError, OSError):
            # Unparseable or out-of-range timestamps are skipped, matching
            # how unparseable times are treated for device-local exports.
            continue
        raw_location = pv.get("location")
        if not isinstance(raw_location, dict):
            # A missing/null location is treated like an empty mapping.
            raw_location = {}
        visits.append(PlaceVisit(
            location=_parse_location(raw_location),
            start_time=start_dt,
            end_time=end_dt,
        ))
    return visits


def _visits_from_semantic_segments(
    segments: list, tz: pytz.BaseTzInfo
) -> List[PlaceVisit]:
    """Extract place visits from a device-local ``semanticSegments`` array.

    Segments without a ``visit`` object, without parseable start/end
    times, or without a parseable ``latLng`` string are skipped.

    Raises
    ------
    ImportError
        If ``python-dateutil`` is not installed.
    """
    # Imported lazily so that legacy exports can be read without dateutil.
    try:
        from dateutil import parser as dateutil_parser  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "python-dateutil is required to parse device-local Timeline exports. "
            "Install it with 'pip install python-dateutil'."
        ) from exc

    visits: List[PlaceVisit] = []
    for segment in segments:
        if not isinstance(segment, dict):
            continue
        # Only interested in visit segments; skip activities and path-only entries
        visit = segment.get("visit")
        if not visit or not isinstance(visit, dict):
            continue
        # Start and end times are ISO 8601 strings with timezone offsets
        start_time_iso = segment.get("startTime")
        end_time_iso = segment.get("endTime")
        if not start_time_iso or not end_time_iso:
            continue
        try:
            # NOTE: a string without an offset parses to a naive datetime,
            # which ``astimezone`` interprets as system local time.
            start_dt = dateutil_parser.isoparse(start_time_iso).astimezone(tz)
            end_dt = dateutil_parser.isoparse(end_time_iso).astimezone(tz)
        except (TypeError, ValueError, OverflowError):
            # Skip unparseable times
            continue
        candidate = visit.get("topCandidate")
        if not isinstance(candidate, dict):
            continue
        place_loc = candidate.get("placeLocation")
        if not isinstance(place_loc, dict):
            continue
        # Coordinates are stored as "latLng": "lat°, lon°"
        latlng_str = place_loc.get("latLng")
        if not latlng_str:
            continue
        try:
            # Strip the degree symbol and split into lat/lon components;
            # unpacking fails (ValueError) unless there are exactly two.
            lat_str, lon_str = (
                part.strip().replace("°", "") for part in latlng_str.split(",")
            )
            lat = float(lat_str)
            lon = float(lon_str)
        except (AttributeError, TypeError, ValueError):
            continue
        # Use the label or semantic type as the name if available
        name = candidate.get("label") or candidate.get("semanticType") or ""
        visits.append(PlaceVisit(
            location=Location(lat=lat, lon=lon, name=str(name)),
            start_time=start_dt,
            end_time=end_dt,
        ))
    return visits


def load_place_visits(path: str, tz_name: str = "Europe/London") -> List[PlaceVisit]:
    """Load all place visits from a Location History JSON file.

    This function supports both the legacy "Semantic Location History"
    exports (containing a top-level ``timelineObjects`` array) and
    newer on-device Timeline exports that expose a ``semanticSegments``
    array. In both cases the goal is to extract "place visits" -
    periods of time spent at a single location.

    For legacy files the timestamps are millisecond epoch strings and
    coordinates are encoded as integer multiples of 1e-7 degrees. For
    device-local exports the timestamps are ISO 8601 strings with
    timezone offsets and coordinates are stored in a ``latLng`` string
    on the ``visit.topCandidate.placeLocation``.

    Malformed individual entries are skipped rather than aborting the
    whole load. When both arrays are present, ``timelineObjects`` takes
    precedence.

    Parameters
    ----------
    path : str
        Path to the JSON file produced by Google Takeout or the
        on-device Timeline export.
    tz_name : str, optional
        The name of the timezone used for localisation, by default
        ``Europe/London``. See the ``pytz`` documentation for valid
        identifiers.

    Returns
    -------
    list of :class:`PlaceVisit`
        A chronologically ordered list of place visits. Empty when the
        file contains neither known array (e.g. only ``rawSignals`` or
        ``userLocationProfile``).

    Raises
    ------
    ImportError
        If the file is a device-local export and ``python-dateutil`` is
        not installed.

    Errors from opening the file, decoding the JSON or resolving
    ``tz_name`` propagate unchanged.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Resolved before inspecting the data so an invalid ``tz_name`` is
    # reported regardless of the file's contents.
    tz = pytz.timezone(tz_name)

    if not isinstance(data, dict):
        # Top-level arrays/scalars carry none of the known keys.
        return []

    timeline_objects = data.get("timelineObjects")
    semantic_segments = data.get("semanticSegments")
    if isinstance(timeline_objects, list):
        # Legacy Semantic Location History format
        visits = _visits_from_timeline_objects(timeline_objects, tz)
    elif isinstance(semantic_segments, list):
        # New device-local Timeline export format
        visits = _visits_from_semantic_segments(semantic_segments, tz)
    else:
        # If the file doesn't contain known keys, return empty list
        return []

    # Sort visits chronologically by start time
    visits.sort(key=lambda v: v.start_time)
    return visits