Initial commit (clean, ignores in place)

commit c74790b014
2025-08-12 01:13:41 +01:00
26 changed files with 2331 additions and 0 deletions

mileage_logger/__init__.py

@@ -0,0 +1,38 @@
"""Top level package for the mileage logging tool.
This package provides a set of modules used to ingest Google Semantic
Location History data, detect work related travel itineraries based on a
whitelisted set of sites, resolve driving distances between those sites
and export the resulting mileage claims into an Excel workbook ready
for submission to a HR system.
The project is organised into subpackages:
* :mod:`mileage_logger.ingest` parse Google Takeout JSON exports
into structured Python objects.
* :mod:`mileage_logger.logic` implement the state machine that
identifies ordered hops between recognised locations in a days
timeline.
* :mod:`mileage_logger.distance` resolve distances via a route
catalogue or, optionally, an external API with caching.
* :mod:`mileage_logger.export` write Excel workbooks or CSV files
containing the final mileage log.
* :mod:`mileage_logger.cli` command line interface for invoking
common workflows such as importing a new export or rebuilding a
monthly workbook.
This package requires Python 3.11 or newer. See the README for
installation and usage instructions.
"""
from .ingest import semantic_reader # noqa: F401
from .logic import detect_itinerary # noqa: F401
from .distance import resolve # noqa: F401
from .export import excel_writer # noqa: F401
__all__ = [
"semantic_reader",
"detect_itinerary",
"resolve",
"excel_writer",
]

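Taken together, these re-exports describe the whole pipeline. A minimal sketch of driving it programmatically, assuming a Takeout export at `takeout.json` and the bundled site/route files (all paths here are placeholders):

```python
# Hypothetical end-to-end run; every file path is a placeholder.
from mileage_logger.ingest.semantic_reader import load_place_visits
from mileage_logger.logic.detect_itinerary import SiteConfig, detect_itinerary
from mileage_logger.distance.resolve import DistanceResolver
from mileage_logger.export.excel_writer import build_monthly_rows, write_monthly_workbook

visits = load_place_visits("takeout.json")            # timezone-aware PlaceVisit objects
sites = SiteConfig.from_yaml("config/sites.yml")      # whitelisted sites and geofences
hops = detect_itinerary(visits, sites)                # ordered hops, grouped per day
resolver = DistanceResolver("tests/data/routes_golden.csv")
rows = build_monthly_rows(hops, sites, resolver)      # {"YYYY-MM": [row tuples]}
write_monthly_workbook(rows, "out/mileage_2025-08.xlsx")  # one sheet per month
```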
mileage_logger/cli.py

@@ -0,0 +1,189 @@
"""Command line interface for the mileage logging tool."""
from __future__ import annotations
import argparse
import os
from datetime import date, datetime, timedelta
from typing import Optional, Tuple
import pytz
from .ingest.semantic_reader import load_place_visits
from .logic.detect_itinerary import SiteConfig, detect_itinerary
from .distance.resolve import DistanceResolver
from .export.excel_writer import build_monthly_rows, write_monthly_workbook
TZ = pytz.timezone("Europe/London")
def _today_local() -> date:
return datetime.now(TZ).date()
def _prev_month_bounds(today: Optional[date] = None) -> Tuple[date, date]:
"""Return (start_date, end_date) for the previous calendar month in Europe/London."""
if today is None:
today = _today_local()
first_this_month = today.replace(day=1)
last_prev_month = first_this_month - timedelta(days=1)
start_prev_month = last_prev_month.replace(day=1)
return start_prev_month, last_prev_month
def _month_bounds(ym: str) -> Tuple[date, date]:
"""Return (start_date, end_date) for the given YYYY-MM."""
year, month = map(int, ym.split("-"))
start = date(year, month, 1)
if month == 12:
end = date(year + 1, 1, 1) - timedelta(days=1)
else:
end = date(year, month + 1, 1) - timedelta(days=1)
return start, end
def _parse_date(s: str) -> date:
y, m, d = map(int, s.split("-"))
return date(y, m, d)
def import_file(
json_path: str,
site_config_path: str,
route_csv_path: str,
output_dir: str,
assume_home_start: bool,
weekdays_only: bool,
month: Optional[str],
last_month: bool,
since: Optional[str],
until: Optional[str],
days: Optional[int],
) -> None:
"""Import a single JSON file and write Excel workbooks (one per month)."""
visits = load_place_visits(json_path)
if not visits:
print(f"No place visits found in {json_path}")
return
# 1) Determine date range filter
start_date: Optional[date] = None
end_date: Optional[date] = None
if month:
start_date, end_date = _month_bounds(month)
elif last_month:
start_date, end_date = _prev_month_bounds()
elif since or until:
if since:
start_date = _parse_date(since)
if until:
end_date = _parse_date(until)
elif days:
end_date = _today_local()
start_date = end_date - timedelta(days=days - 1)
# 2) Apply date filtering to visits (by visit.start_time local date)
if start_date or end_date:
def in_range(v):
d = v.start_time.date()
if start_date and d < start_date:
return False
if end_date and d > end_date:
return False
return True
visits = [v for v in visits if in_range(v)]
if not visits:
label = f"{start_date or ''}..{end_date or ''}"
print(f"No place visits in requested range {label}")
return
site_config = SiteConfig.from_yaml(site_config_path)
hops = detect_itinerary(visits, site_config, assume_home_start=assume_home_start)
if not hops:
print("No recognised hops detected after filtering.")
return
# 3) Weekday filter (Sat=5, Sun=6)
if weekdays_only:
hops = [h for h in hops if h.date.weekday() < 5]
if not hops:
print("All hops fell on weekends; nothing to write.")
return
resolver = DistanceResolver(route_csv_path)
rows_by_month = build_monthly_rows(hops, site_config, resolver)
# 4) Write one workbook per month present
os.makedirs(output_dir, exist_ok=True)
for month_key, rows in sorted(rows_by_month.items()):
# If a specific month/range was requested, rows_by_month will already reflect it.
output_path = os.path.join(output_dir, f"mileage_{month_key}.xlsx")
write_monthly_workbook({month_key: rows}, output_path)
print(f"Wrote {output_path} ({len(rows)} rows)")
def main(argv: list[str] | None = None) -> None:
parser = argparse.ArgumentParser(description="Mileage logging tool")
subparsers = parser.add_subparsers(dest="command", required=True)
import_parser = subparsers.add_parser("import", help="Import a single JSON export")
import_parser.add_argument("json_path", help="Path to the JSON file to import")
import_parser.add_argument(
"--sites", dest="site_config_path",
default=os.path.join(os.path.dirname(__file__), "../config/sites.yml"),
help="Path to the sites.yml configuration",
)
import_parser.add_argument(
"--routes", dest="route_csv_path",
default=os.path.join(os.path.dirname(__file__), "../tests/data/routes_golden.csv"),
help="Path to the routes CSV catalogue",
)
import_parser.add_argument(
"--output", dest="output_dir", default=os.getcwd(),
help="Directory to write the Excel workbook(s)",
)
# Behavior toggles
import_parser.add_argument(
"--no-assume-home-start", action="store_true",
help="Do not inject a Home→first-site hop when a day doesn't start at Home.",
)
import_parser.add_argument(
"--weekdays-only", action="store_true",
help="Exclude Saturday/Sunday hops.",
)
# Date filters (choose one style)
import_parser.add_argument("--last-month", action="store_true",
help="Process the previous calendar month.")
import_parser.add_argument("--month", metavar="YYYY-MM",
help="Process a specific calendar month, e.g. 2025-08.")
import_parser.add_argument("--since", metavar="YYYY-MM-DD",
help="Lower bound (inclusive) for visits to process.")
import_parser.add_argument("--until", metavar="YYYY-MM-DD",
help="Upper bound (inclusive) for visits to process.")
import_parser.add_argument("--days", type=int,
help="Process the last N days (relative to today).")
args = parser.parse_args(argv)
if args.command == "import":
import_file(
args.json_path,
args.site_config_path,
args.route_csv_path,
args.output_dir,
assume_home_start=(not args.no_assume_home_start),
weekdays_only=args.weekdays_only,
month=args.month,
last_month=args.last_month,
since=args.since,
until=args.until,
days=args.days,
)
if __name__ == "__main__":
main()

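For reference, two sketched invocations of the CLI above, driven through `main` so they stay testable; the flags are exactly those registered on the `import` subparser, while the file paths are placeholders:

```python
from mileage_logger.cli import main

# Previous calendar month, weekdays only, workbooks written to ./out
main(["import", "takeout.json", "--last-month", "--weekdays-only", "--output", "out"])

# A specific month with explicit site and route catalogues
main(["import", "takeout.json", "--month", "2025-08",
      "--sites", "config/sites.yml",
      "--routes", "tests/data/routes_golden.csv"])
```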
mileage_logger/distance/__init__.py

@@ -0,0 +1,12 @@
"""Distance resolution utilities.
This subpackage exposes classes and functions that resolve driving
distances between pairs of recognised sites. The primary
implementation is :class:`DistanceResolver`, which first consults a
pre-defined route catalogue before optionally consulting an external
API and finally falling back to a simple geodesic calculation.
"""
from .resolve import DistanceResolver, haversine_distance
__all__ = ["DistanceResolver", "haversine_distance"]

mileage_logger/distance/resolve.py

@@ -0,0 +1,210 @@
"""Resolve driving distances between sites.
The :class:`DistanceResolver` class provides a simple mechanism to
determine the distance in miles between two points. It is designed to
prefer a local route catalogue (CSV) if available, fall back to
external API calls when API keys are configured and, as a last
resort, compute a straight-line distance using the haversine
formula.
Caching is performed to avoid repeated API calls or calculations. A
time-to-live (TTL) can be specified when constructing the resolver;
entries older than the TTL are treated as missing and recomputed on
demand. Distances are rounded to one decimal place as
required by HR mileage claim forms.
"""
from __future__ import annotations
import csv
import os
import time
from dataclasses import dataclass
from typing import Dict, Optional, Tuple
try:
import httpx # type: ignore
except ImportError: # Optional dependency. If unavailable, API calls will be skipped.
httpx = None # type: ignore
from ..logic.detect_itinerary import haversine_distance
@dataclass
class _CacheEntry:
distance: float
timestamp: float
class DistanceResolver:
"""Resolve driving distances between two locations.
The resolver consults an in-memory cache, a local route catalogue,
an optional external API and finally falls back to a straight-line
calculation using the haversine formula. Distances are cached for
the lifetime of the object. Rounding to one decimal mile is
applied uniformly.
"""
def __init__(self, route_csv_path: Optional[str] = None, api_key: Optional[str] = None,
http_client: Optional[object] = None, ttl_seconds: float = 365 * 24 * 3600,
vehicle_label: str = "SH11 DRV (Own 1.6CC Diesel Car/Van)", job_role: str = "ICT Technician"):
"""Initialise the distance resolver.
Parameters
----------
route_csv_path : str, optional
Path to a CSV file containing pre-approved route distances.
The file should have at least three columns: origin,
destination and miles. The entries are assumed to be
directional; if symmetric distances are desired both
directions must be provided.
api_key : str, optional
API key for the Google Routes API. If omitted, API calls
will be skipped.
        http_client : :class:`httpx.Client`, optional
            HTTP client instance to use for API requests. If omitted, a
            client is created automatically when an API key is
            configured and httpx is available.
ttl_seconds : float, optional
Time-to-live for cache entries in seconds. Expired
entries are recomputed on demand. The default is one year.
"""
        self.api_key = api_key
        # Use the supplied HTTP client, or create one when an API key is
        # configured and httpx is available; otherwise leave it unset so
        # API calls are skipped entirely.
        if httpx is not None and (http_client is not None or api_key):
            self.http_client = http_client or httpx.Client()
        else:
            self.http_client = None
self.ttl_seconds = ttl_seconds
self.vehicle_label = vehicle_label
self.job_role = job_role
self.cache: Dict[Tuple[str, str], _CacheEntry] = {}
# Load route catalogue
self.route_catalog: Dict[Tuple[str, str], float] = {}
if route_csv_path and os.path.exists(route_csv_path):
with open(route_csv_path, "r", encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
if not row or row[0].startswith("#"):
continue
try:
origin, destination, miles_str = row[:3]
miles = float(miles_str)
self.route_catalog[(origin.strip(), destination.strip())] = miles
except Exception:
# Skip malformed entries silently
continue
def _get_from_cache(self, origin: str, dest: str) -> Optional[float]:
"""Retrieve a cached distance if present and unexpired."""
entry = self.cache.get((origin, dest))
if entry is None:
return None
if (time.time() - entry.timestamp) > self.ttl_seconds:
# Expired
return None
return entry.distance
def _set_cache(self, origin: str, dest: str, distance: float) -> None:
"""Cache the given distance for the origin/destination pair."""
self.cache[(origin, dest)] = _CacheEntry(distance=distance, timestamp=time.time())
def resolve(self, origin_name: str, dest_name: str, origin_coords: Tuple[float, float], dest_coords: Tuple[float, float]) -> float:
"""Resolve the distance between two sites in miles.
This method will consult the cache, route catalogue, external API
and finally compute a haversine distance. Once resolved, the
distance is cached and rounded to one decimal place.
Parameters
----------
origin_name : str
Canonical name of the origin site. Used for cache and
catalogue lookups.
dest_name : str
Canonical name of the destination site.
origin_coords : tuple(float, float)
Latitude and longitude of the origin in decimal degrees.
dest_coords : tuple(float, float)
Latitude and longitude of the destination in decimal degrees.
Returns
-------
float
The resolved driving distance in miles, rounded to one
decimal place.
"""
# First check the cache
cached = self._get_from_cache(origin_name, dest_name)
if cached is not None:
return cached
# Second consult the route catalogue
catalogue_key = (origin_name, dest_name)
if catalogue_key in self.route_catalog:
dist = self.route_catalog[catalogue_key]
rounded = round(dist, 1)
self._set_cache(origin_name, dest_name, rounded)
return rounded
# Attempt to call external API if configured
if self.api_key:
try:
dist = self._call_google_routes_api(origin_coords, dest_coords)
if dist is not None:
rounded = round(dist, 1)
self._set_cache(origin_name, dest_name, rounded)
return rounded
except Exception:
# Swallow API errors and fall back
pass
# Fall back to haversine distance
dist = haversine_distance(origin_coords[0], origin_coords[1], dest_coords[0], dest_coords[1])
rounded = round(dist, 1)
self._set_cache(origin_name, dest_name, rounded)
return rounded
def _call_google_routes_api(self, origin_coords: Tuple[float, float], dest_coords: Tuple[float, float]) -> Optional[float]:
"""Call the Google Maps Routes API to compute driving distance.
Note that this is a blocking call. The caller should ensure that
network access is permitted and that a valid API key has been
configured. If the request fails or the response cannot be
parsed, ``None`` is returned.
"""
# Construct the API request
# See https://developers.google.com/maps/documentation/routes for details
base_url = "https://routes.googleapis.com/directions/v2:computeRoutes"
# Compose JSON payload
payload = {
"origin": {"location": {"latLng": {"latitude": origin_coords[0], "longitude": origin_coords[1]}}},
"destination": {"location": {"latLng": {"latitude": dest_coords[0], "longitude": dest_coords[1]}}},
"travelMode": "DRIVE",
"routingPreference": "TRAFFIC_AWARE",
"computeAlternativeRoutes": False,
"units": "IMPERIAL",
}
headers = {
"Content-Type": "application/json",
"X-Goog-Api-Key": self.api_key,
"X-Goog-FieldMask": "routes.duration,routes.distanceMeters",
}
        # If httpx or a configured client is unavailable, skip the API call
        if httpx is None or self.http_client is None:
            return None
resp = self.http_client.post(base_url, json=payload, headers=headers)
if resp.status_code != 200:
return None
try:
data = resp.json()
routes = data.get("routes") or []
if not routes:
return None
# Distance is returned in meters; convert to miles
meters = routes[0]["distanceMeters"]
miles = meters / 1609.34
return float(miles)
except Exception:
return None

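A short sketch of the resolver's fallback order. The site names, coordinates and CSV row are invented; with no API key configured, an uncatalogued pair drops straight through to the haversine estimate:

```python
from mileage_logger.distance.resolve import DistanceResolver

resolver = DistanceResolver(route_csv_path="tests/data/routes_golden.csv")

# Catalogue hit: returns the pre-approved mileage rounded to 0.1 mi
# (assuming the CSV contains a "Home,Lingwood,13.2"-style row).
miles = resolver.resolve("Home", "Lingwood", (52.63, 1.30), (52.62, 1.48))

# Unknown pair and no API key: falls back to the straight-line distance.
estimate = resolver.resolve("Home", "SomewhereNew", (52.63, 1.30), (52.70, 1.10))
```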
mileage_logger/export/__init__.py

@@ -0,0 +1,5 @@
"""Export utilities for writing mileage logs to Excel and CSV."""
from .excel_writer import write_monthly_workbook, build_monthly_rows
__all__ = ["write_monthly_workbook", "build_monthly_rows"]

mileage_logger/export/excel_writer.py

@@ -0,0 +1,122 @@
"""Write mileage itineraries to Excel workbooks.
This module uses :mod:`openpyxl` to construct a workbook with one sheet
per month. Each row corresponds to a single hop between recognised
sites. Columns follow the specification used by the EveryHR system:
* ``Date``: calendar date in ISO format (YYYY-MM-DD).
* ``Purpose``: free text summarising the journey, e.g. ``"Travel from
  Home to Lingwood Primary Academy 13.2mi"``.
* ``Miles``: numeric value rounded to one decimal place.
* ``Vehicle``: the vehicle descriptor configured for the user.
* ``Job Role``: the job role of the user.
* ``From``: friendly label of the origin site.
* ``To``: friendly label of the destination site.
* ``Notes``: left blank for manual additions.
Rows are grouped by month (YYYY-MM). Each sheet is named after the
month and contains a header row followed by one row per hop in
chronological order.
"""
from __future__ import annotations
import os
from collections import defaultdict
from datetime import date
from typing import Dict, Iterable, List, Tuple
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from ..logic.detect_itinerary import Hop, SiteConfig
def build_monthly_rows(hops: Iterable[Hop], site_config: SiteConfig, distance_resolver) -> Dict[str, List[Tuple[str, str, float, str, str, str, str, str]]]:
"""Prepare rows grouped by month for Excel output.
Parameters
----------
hops : iterable of :class:`Hop`
The hops produced by itinerary detection.
site_config : :class:`SiteConfig`
Used to look up friendly labels for canonical site names.
distance_resolver : object
An object with a ``resolve(origin_name, dest_name, origin_coords, dest_coords)``
method that returns a distance in miles. See
:class:`~mileage_logger.distance.resolve.DistanceResolver`.
Returns
-------
dict mapping str -> list of tuples
Keys are month strings in the form ``YYYY-MM``. Values are
lists of tuples containing the data for each row: (date_str,
purpose, miles, vehicle, job_role, from_label, to_label, notes).
"""
rows_by_month: Dict[str, List[Tuple[str, str, float, str, str, str, str, str]]] = defaultdict(list)
for hop in hops:
month_key = hop.date.strftime("%Y-%m")
origin_site = site_config.by_canonical.get(hop.origin)
dest_site = site_config.by_canonical.get(hop.destination)
if origin_site is None or dest_site is None:
continue
# Resolve distance
dist = distance_resolver.resolve(
hop.origin,
hop.destination,
(origin_site.lat, origin_site.lon),
(dest_site.lat, dest_site.lon),
)
# Build purpose string
purpose = f"Travel from {origin_site.label} to {dest_site.label} {dist:.1f}mi"
rows_by_month[month_key].append(
(
hop.date.isoformat(),
purpose,
dist,
                getattr(distance_resolver, "vehicle_label", "SH11 DRV (Own 1.6CC Diesel Car/Van)"),
                getattr(distance_resolver, "job_role", "ICT Technician"),
origin_site.label,
dest_site.label,
"",
)
)
return rows_by_month
def write_monthly_workbook(rows_by_month: Dict[str, List[Tuple[str, str, float, str, str, str, str, str]]], output_path: str) -> None:
"""Write the grouped rows into an Excel workbook.
Parameters
----------
rows_by_month : dict
Mapping from month strings to lists of row tuples as returned
by :func:`build_monthly_rows`.
output_path : str
Path of the Excel workbook to write. Any existing file will be
overwritten.
"""
wb = Workbook()
# Remove the default sheet created by openpyxl
default_sheet = wb.active
wb.remove(default_sheet)
for month, rows in sorted(rows_by_month.items()):
ws = wb.create_sheet(title=month)
# Write header
header = ["Date", "Purpose", "Miles", "Vehicle", "Job Role", "From", "To", "Notes"]
ws.append(header)
for row in rows:
ws.append(list(row))
# Autosize columns (approximate)
for col_idx in range(1, len(header) + 1):
column_letter = get_column_letter(col_idx)
max_length = max(
len(str(ws.cell(row=r + 1, column=col_idx).value)) for r in range(len(rows) + 1)
)
# Add a little extra padding
ws.column_dimensions[column_letter].width = max_length + 2
    # Ensure the parent directory exists; output_path may be a bare
    # filename, in which case dirname is empty and makedirs would fail.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
wb.save(output_path)

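Because `build_monthly_rows` only duck-types its resolver, any object with a `resolve` method (plus the optional `vehicle_label` and `job_role` attributes) will do. A minimal sketch with an invented fixed-distance stub:

```python
from datetime import date
from mileage_logger.logic.detect_itinerary import Hop, SiteConfig, SiteEntry
from mileage_logger.export.excel_writer import build_monthly_rows, write_monthly_workbook

class FixedResolver:
    """Stub resolver: every hop is reported as 13.2 miles."""
    vehicle_label = "SH11 DRV (Own 1.6CC Diesel Car/Van)"
    job_role = "ICT Technician"
    def resolve(self, origin, dest, origin_coords, dest_coords):
        return 13.2

sites = SiteConfig([
    SiteEntry("Home", "Home", 52.63, 1.30, 0.0, []),
    SiteEntry("Lingwood", "Lingwood Primary Academy", 52.62, 1.48, 0.0, []),
])
hops = [Hop(date(2025, 8, 12), "Home", "Lingwood")]
rows = build_monthly_rows(hops, sites, FixedResolver())   # {"2025-08": [...]}
write_monthly_workbook(rows, "out/mileage_2025-08.xlsx")
```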
mileage_logger/gui.py

@@ -0,0 +1,131 @@
"""Simple web GUI for the mileage logger.
This module exposes a FastAPI application that wraps the core
functionality of the mileage logger with a minimal HTML front end. It
allows a user to upload a Google Semantic Location History JSON file
and returns an Excel workbook containing their mileage claims. The
application also renders a basic status page showing the detected
itinerary.
Usage
-----
Run the server using uvicorn:
```
uvicorn mileage_logger.gui:app --reload --port 8000
```
Then navigate to ``http://localhost:8000`` in your web browser. Use
the form to upload a JSON export. After processing, the server will
return an Excel file for download.
Limitations
-----------
This GUI is intentionally lightweight and is not designed for
concurrent multi-user access. It does not persist files on disk and
does not perform any authentication or authorisation. For production
use consider extending it with proper user management and storage.
"""
from __future__ import annotations
import os
import tempfile
from io import BytesIO
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import HTMLResponse, StreamingResponse
from .ingest.semantic_reader import load_place_visits
from .logic.detect_itinerary import SiteConfig, detect_itinerary
from .distance.resolve import DistanceResolver
from .export.excel_writer import build_monthly_rows, write_monthly_workbook
# Load configuration once at startup. You can change the path to
# config/sites.yml if you have customised it. The route catalogue is
# loaded on-demand when handling uploads.
DEFAULT_SITE_CONFIG_PATH = os.path.join(os.path.dirname(__file__), "../config/sites.yml")
DEFAULT_ROUTE_CSV_PATH = os.path.join(os.path.dirname(__file__), "../tests/data/routes_golden.csv")
site_config: SiteConfig = SiteConfig.from_yaml(DEFAULT_SITE_CONFIG_PATH)
app = FastAPI(title="Mileage Logger GUI")
@app.get("/", response_class=HTMLResponse)
async def index() -> str:
"""Render a simple upload form."""
return """
<html>
<head>
<title>Mileage Logger</title>
</head>
<body>
<h1>Mileage Logger</h1>
<p>Select a Google Takeout JSON file to process. The file
should contain the "timelineObjects" array from your Semantic
Location History export.</p>
<form action="/process" method="post" enctype="multipart/form-data">
<input type="file" name="file" accept="application/json" required />
<br/><br/>
<label for="vehicle">Vehicle description:</label>
<input type="text" id="vehicle" name="vehicle" value="SH11 DRV (Own 1.6CC Diesel Car/Van)" />
<br/><br/>
<label for="job_role">Job role:</label>
<input type="text" id="job_role" name="job_role" value="ICT Technician" />
<br/><br/>
<input type="submit" value="Process" />
</form>
</body>
</html>
"""
@app.post("/process")
async def process_file(
file: UploadFile = File(...),
vehicle: str = Form("SH11 DRV (Own 1.6CC Diesel Car/Van)"),
job_role: str = Form("ICT Technician"),
) -> StreamingResponse:
"""Handle upload and return an Excel workbook.
The uploaded file is saved to a temporary file on disk and then
passed through the existing CLI pipeline. The resulting workbook
contains one sheet per month and is returned as a streaming
response.
"""
# Persist upload to a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as tmp_in:
contents = await file.read()
tmp_in.write(contents)
tmp_in.flush()
input_path = tmp_in.name
# Parse visits and detect itinerary
visits = load_place_visits(input_path)
hops = detect_itinerary(visits, site_config)
resolver = DistanceResolver(route_csv_path=DEFAULT_ROUTE_CSV_PATH, vehicle_label=vehicle, job_role=job_role)
rows_by_month = build_monthly_rows(hops, site_config, resolver)
# Write workbook to in-memory buffer
output_stream = BytesIO()
    # write_monthly_workbook writes to a path, so round-trip the workbook
    # through a second temporary file and stream its bytes back.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp_out:
        workbook_path = tmp_out.name
    write_monthly_workbook(rows_by_month, workbook_path)
    with open(workbook_path, "rb") as f:
        output_stream.write(f.read())
    # Clean up both temporary files
    for path in (input_path, workbook_path):
        try:
            os.remove(path)
        except OSError:
            pass
# Prepare response
output_stream.seek(0)
filename = "mileage.xlsx"
headers = {"Content-Disposition": f"attachment; filename={filename}"}
return StreamingResponse(output_stream, media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", headers=headers)

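On the client side, one way to exercise the endpoint above, assuming the server is running locally and httpx is installed; paths and form values are placeholders:

```python
import httpx

with open("takeout.json", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/process",
        files={"file": ("takeout.json", f, "application/json")},
        data={"vehicle": "SH11 DRV (Own 1.6CC Diesel Car/Van)",
              "job_role": "ICT Technician"},
        timeout=60.0,
    )
resp.raise_for_status()
with open("mileage.xlsx", "wb") as out:
    out.write(resp.content)  # the returned Excel workbook
```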
mileage_logger/ingest/__init__.py

@@ -0,0 +1,18 @@
"""Subpackage for data ingestion.
The :mod:`mileage_logger.ingest` package contains utilities for reading
Google Semantic Location History JSON exports. The core entry point is
the :func:`load_place_visits` function which converts raw JSON into
structured :class:`PlaceVisit` objects. These objects expose
timezone-aware start and end timestamps as well as geographic
coordinates and the human readable name of the location.
"""
from .semantic_reader import Location, PlaceVisit, ActivitySegment, load_place_visits
__all__ = [
"Location",
"PlaceVisit",
"ActivitySegment",
"load_place_visits",
]

mileage_logger/ingest/semantic_reader.py

@@ -0,0 +1,258 @@
"""Parser for Google Semantic Location History exports.
Google Takeout and on-device exports of the Timeline API are provided
as JSON files under a ``timelineObjects`` key. Each entry in
``timelineObjects`` is either a ``placeVisit`` or an ``activitySegment``.
This module exposes data classes representing those events and a
convenient loader that normalises timestamps and coordinate formats.
Timestamps in the source JSON are encoded as millisecond epoch
strings. When loaded these are converted into timezone-aware
:class:`datetime.datetime` objects. Coordinates in the JSON are stored
as integer multiples of 1e-7 degrees; we scale them to floats.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List
import pytz
@dataclass
class Location:
"""A simple geographic location.
Attributes
----------
lat : float
Latitude in decimal degrees.
lon : float
Longitude in decimal degrees.
name : str
Human readable name of the location as provided by Google.
"""
lat: float
lon: float
name: str
@dataclass
class PlaceVisit:
"""A visit to a single location for a period of time.
Attributes
----------
location : :class:`Location`
The geographic coordinates and name of the place.
start_time : :class:`datetime`
The timezone-aware start timestamp of the visit.
end_time : :class:`datetime`
The timezone-aware end timestamp of the visit.
"""
location: Location
start_time: datetime
end_time: datetime
@dataclass
class ActivitySegment:
"""A movement between two locations.
While not used directly in itinerary detection, activity segments
contain useful timing information that could be used to derive the
start date for a hop between recognised sites. This class is
provided for completeness and potential future use.
"""
start_location: Location
end_location: Location
start_time: datetime
end_time: datetime
activity_type: str
def _ms_to_dt(ms: str, tz: pytz.BaseTzInfo) -> datetime:
"""Convert a millisecond epoch string into a timezone-aware datetime.
Parameters
----------
ms : str
Milliseconds since the Unix epoch encoded as a decimal string.
tz : :class:`pytz.tzinfo.BaseTzInfo`
The timezone into which to localise the resulting datetime.
Returns
-------
:class:`datetime`
A timezone-aware datetime corresponding to the input.
"""
# Google exports store times in milliseconds since UTC epoch
ts = int(ms) / 1000.0
utc_dt = datetime.fromtimestamp(ts, timezone.utc)
return utc_dt.astimezone(tz)
def _parse_location(raw: dict) -> Location:
"""Parse a location dictionary from the export format.
The export encodes lat/lon in integer multiples of 1e-7 degrees.
This helper scales the values into decimals and extracts the
``name`` field.
Parameters
----------
raw : dict
A mapping containing ``latitudeE7``, ``longitudeE7`` and
``name`` keys.
Returns
-------
:class:`Location`
A populated location object.
"""
lat = raw.get("latitudeE7")
lon = raw.get("longitudeE7")
name = raw.get("name", "")
return Location(lat=float(lat) / 1e7 if lat is not None else 0.0,
lon=float(lon) / 1e7 if lon is not None else 0.0,
name=name)
def load_place_visits(path: str, tz_name: str = "Europe/London") -> List[PlaceVisit]:
"""Load all place visits from a Location History JSON file.
This function supports both the legacy "Semantic Location History"
exports (containing a top-level ``timelineObjects`` array) and
newer on-device Timeline exports that expose a ``semanticSegments``
    array. In both cases the goal is to extract "place visits":
    periods of time spent at a single location.
For legacy files the timestamps are millisecond epoch strings and
coordinates are encoded as integer multiples of 1e-7 degrees. For
device-local exports the timestamps are ISO 8601 strings with
timezone offsets and coordinates are stored in a ``latLng`` string
on the ``visit.topCandidate.placeLocation``.
Parameters
----------
path : str
Path to the JSON file produced by Google Takeout or the
on-device Timeline export.
tz_name : str, optional
The name of the timezone used for localisation, by default
``Europe/London``. See the ``pytz`` documentation for valid
identifiers.
Returns
-------
list of :class:`PlaceVisit`
A chronologically ordered list of place visits.
"""
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
tz = pytz.timezone(tz_name)
visits: List[PlaceVisit] = []
# Legacy Semantic Location History format
if "timelineObjects" in data and isinstance(data["timelineObjects"], list):
timeline_objects = data.get("timelineObjects", [])
for obj in timeline_objects:
if "placeVisit" in obj:
pv = obj["placeVisit"]
loc = _parse_location(pv.get("location", {}))
dur = pv.get("duration", {})
start_ms = dur.get("startTimestampMs")
end_ms = dur.get("endTimestampMs")
if start_ms is None or end_ms is None:
# Skip malformed entries
continue
visits.append(PlaceVisit(
location=loc,
start_time=_ms_to_dt(start_ms, tz),
end_time=_ms_to_dt(end_ms, tz),
))
elif "activitySegment" in obj:
# We ignore activity segments for now; they are parsed here
# only to support potential future features such as deriving
# more accurate hop start times.
seg = obj["activitySegment"]
start_loc = _parse_location(seg.get("startLocation", {}))
end_loc = _parse_location(seg.get("endLocation", {}))
dur = seg.get("duration", {})
start_ms = dur.get("startTimestampMs")
end_ms = dur.get("endTimestampMs")
if start_ms is None or end_ms is None:
continue
# Create ActivitySegment instance (unused for now)
# The object is not appended to the visits list because
# itinerary detection only relies on place visits.
_ = ActivitySegment(
start_location=start_loc,
end_location=end_loc,
start_time=_ms_to_dt(start_ms, tz),
end_time=_ms_to_dt(end_ms, tz),
activity_type=seg.get("activityType", "UNKNOWN"),
)
# New device-local Timeline export format
elif "semanticSegments" in data and isinstance(data["semanticSegments"], list):
try:
from dateutil import parser as dateutil_parser # type: ignore
except ImportError:
raise ImportError(
"python-dateutil is required to parse device-local Timeline exports. "
"Install it with 'pip install python-dateutil'."
)
for segment in data["semanticSegments"]:
# Only interested in visit segments; skip activities and path-only entries
visit = segment.get("visit")
if not visit:
continue
# Extract start and end times (ISO 8601 with timezone offsets)
start_time_iso = segment.get("startTime")
end_time_iso = segment.get("endTime")
if not start_time_iso or not end_time_iso:
continue
try:
start_dt = dateutil_parser.isoparse(start_time_iso).astimezone(tz)
end_dt = dateutil_parser.isoparse(end_time_iso).astimezone(tz)
except (ValueError, OverflowError):
# Skip unparseable times
continue
# Extract coordinates; stored as "latLng": "lat°, lon°"
place_loc = visit.get("topCandidate", {}).get("placeLocation", {})
latlng_str = place_loc.get("latLng")
if not latlng_str:
continue
# Strip degree symbol and split into lat/lon components
try:
lat_str, lon_str = [c.strip().replace("°", "") for c in latlng_str.split(",")]
lat = float(lat_str)
lon = float(lon_str)
except Exception:
continue
# Use the semantic type or label as the name if available
candidate = visit.get("topCandidate", {})
name = candidate.get("label") or candidate.get("semanticType") or ""
visits.append(PlaceVisit(
location=Location(lat=lat, lon=lon, name=str(name)),
start_time=start_dt,
end_time=end_dt,
))
# Ignore any other structures (e.g. rawSignals, userLocationProfile)
else:
# If the file doesn't contain known keys, return empty list
return []
# Sort visits chronologically by start time
visits.sort(key=lambda v: v.start_time)
return visits

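To make the legacy shape concrete, a tiny synthetic export round-tripped through the loader. The timestamps are millisecond epoch strings corresponding to 2025-08-12 in Europe/London, and the E7 coordinates are invented:

```python
import json
import tempfile
from mileage_logger.ingest.semantic_reader import load_place_visits

sample = {"timelineObjects": [{"placeVisit": {
    "location": {"latitudeE7": 526300000, "longitudeE7": 13000000, "name": "Home"},
    "duration": {"startTimestampMs": "1754956800000",   # 2025-08-12 00:00:00 UTC
                 "endTimestampMs": "1754960400000"},    # one hour later
}}]}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(sample, f)

visits = load_place_visits(f.name)
print(visits[0].location.name, visits[0].start_time.isoformat())
# Home 2025-08-12T01:00:00+01:00  (localised to BST)
```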
mileage_logger/logic/__init__.py

@@ -0,0 +1,13 @@
"""Business logic for detecting work itineraries.
This package exposes functions used to interpret a chronologically
ordered list of :class:`PlaceVisit` objects and reduce them into a
sequence of 'hops' between recognised work locations. Recognition is
driven by a site configuration file (YAML) that defines canonical
names, friendly labels, optional aliases and geofences for each
location.
"""
from .detect_itinerary import SiteConfig, SiteEntry, Hop, detect_itinerary
__all__ = ["SiteConfig", "SiteEntry", "Hop", "detect_itinerary"]

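Recognition is driven entirely by that YAML file. A minimal sketch of a configuration accepted by `SiteConfig.from_yaml` in the next file; the loader also accepts a bare list of entries or a mapping keyed by canonical name, and all values below are invented:

```python
import textwrap
from mileage_logger.logic.detect_itinerary import SiteConfig

yaml_text = textwrap.dedent("""\
    sites:
      - canonical: Home
        label: Home
        lat: 52.63
        lon: 1.30
        radius_m: 150
        aliases: ["my house"]
      - canonical: Lingwood
        label: Lingwood Primary Academy
        lat: 52.62
        lon: 1.48
        radius_m: 200
""")
with open("sites_example.yml", "w", encoding="utf-8") as f:
    f.write(yaml_text)

config = SiteConfig.from_yaml("sites_example.yml")
assert config.alias_map["my house"] == "Home"
```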
mileage_logger/logic/detect_itinerary.py

@@ -0,0 +1,176 @@
"""Detect ordered hops between whitelisted sites in a day's timeline.
We process visits per calendar day (Europe/London), resetting state each
day. We also support injecting a synthetic Home→FirstSite hop when the
first recognised site of the day isn't Home (assume_home_start).
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import date
from typing import Dict, Iterable, List, Optional, Tuple
from collections import defaultdict
import math
import yaml
from ..ingest.semantic_reader import Location, PlaceVisit
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Compute the great-circle distance between two points in miles."""
R = 3958.8 # Earth radius in miles
phi1 = math.radians(lat1)
phi2 = math.radians(lat2)
dphi = math.radians(lat2 - lat1)
dlambda = math.radians(lon2 - lon1)
a = math.sin(dphi / 2.0) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2.0) ** 2
c = 2 * math.atan2(math.sqrt(a), math.sqrt(max(0.0, 1 - a)))
return R * c
@dataclass
class SiteEntry:
"""Represents a single recognised site from the configuration."""
canonical: str
label: str
lat: float
lon: float
radius_m: float
aliases: List[str]
class SiteConfig:
"""Holds all recognised site definitions keyed by canonical name."""
def __init__(self, sites: Iterable[SiteEntry]):
self.by_canonical: Dict[str, SiteEntry] = {s.canonical: s for s in sites}
self.alias_map: Dict[str, str] = {}
for site in sites:
for alias in [site.canonical] + site.aliases:
self.alias_map[alias.lower()] = site.canonical
@classmethod
def from_yaml(cls, path: str) -> "SiteConfig":
"""Load a site configuration from a YAML file."""
with open(path, "r", encoding="utf-8") as f:
raw = yaml.safe_load(f)
sites_data: List[Dict[str, object]] = []
if isinstance(raw, list):
sites_data = raw
elif isinstance(raw, dict):
if "sites" in raw and isinstance(raw["sites"], list):
sites_data = raw["sites"]
else:
for canon, entry in raw.items():
entry = entry or {}
if not isinstance(entry, dict):
raise ValueError("Site entry for %s must be a mapping" % canon)
entry = dict(entry)
entry.setdefault("canonical", canon)
sites_data.append(entry)
else:
raise ValueError("Invalid site configuration format")
sites: List[SiteEntry] = []
for entry in sites_data:
canonical = entry.get("canonical") or entry.get("name")
if not canonical:
raise ValueError("Site entry missing canonical name")
label = entry.get("label", canonical)
lat = float(entry.get("lat", 0.0))
lon = float(entry.get("lon", 0.0))
radius_m = float(entry.get("radius_m", 0.0))
aliases = entry.get("aliases") or []
sites.append(SiteEntry(
canonical=canonical,
label=label,
lat=lat,
lon=lon,
radius_m=radius_m,
aliases=list(aliases),
))
return cls(sites)
def recognise(self, location: Location) -> Optional[str]:
"""Return canonical site name if this location matches by name/alias or geofence."""
name_lower = (location.name or "").lower()
# Pass 1: name/alias substring match
for alias, canonical in self.alias_map.items():
if alias in name_lower:
return canonical
# Pass 2: geofence match
for canonical, site in self.by_canonical.items():
if site.radius_m > 0:
max_dist_miles = site.radius_m / 1609.34
d = haversine_distance(location.lat, location.lon, site.lat, site.lon)
if d <= max_dist_miles:
return canonical
return None
@dataclass
class Hop:
"""A hop from one recognised site to another, dated by the origin's start date."""
date: date
origin: str
destination: str
def _build_day_hops(day_visits: List[PlaceVisit], site_config: SiteConfig, assume_home_start: bool) -> List[Hop]:
"""Build ordered hops for a single day of visits."""
# Ensure chronological order by *start* time
day_visits = sorted(day_visits, key=lambda v: v.start_time)
recognised: List[Tuple[str, PlaceVisit]] = []
last_site: Optional[str] = None
for v in day_visits:
s = site_config.recognise(v.location)
if not s:
continue
if s == last_site:
continue # ignore duplicates back-to-back
recognised.append((s, v))
last_site = s
if not recognised:
return []
# Inject Home at start if enabled and first site isn't Home
if assume_home_start and recognised[0][0] != "Home":
first_time = recognised[0][1].start_time
synthetic_home = PlaceVisit(location=Location(lat=0.0, lon=0.0, name="Home"),
start_time=first_time, end_time=first_time)
recognised.insert(0, ("Home", synthetic_home))
# Walk forward, stop at second Home
hops: List[Hop] = []
home_hits = 1 if recognised and recognised[0][0] == "Home" else 0
for i in range(1, len(recognised)):
origin_site, origin_visit = recognised[i - 1]
dest_site, _dest_visit = recognised[i]
hop_date = origin_visit.start_time.date()
if origin_site != dest_site:
hops.append(Hop(date=hop_date, origin=origin_site, destination=dest_site))
if dest_site == "Home":
home_hits += 1
if home_hits >= 2:
break
return hops
def detect_itinerary(visits: List[PlaceVisit], site_config: SiteConfig, *, assume_home_start: bool = True) -> List[Hop]:
"""Reduce all visits into ordered hops per day, concatenated across the file."""
if not visits:
return []
# Group by the local date from each visit's start_time
by_day: Dict[date, List[PlaceVisit]] = defaultdict(list)
for v in visits:
by_day[v.start_time.date()].append(v)
hops_all: List[Hop] = []
for day in sorted(by_day.keys()):
day_hops = _build_day_hops(by_day[day], site_config, assume_home_start=assume_home_start)
hops_all.extend(day_hops)
return hops_all
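
Finally, a compact worked example of the day-level state machine above, using name-based recognition; all names, coordinates and times are invented:

```python
from datetime import datetime
import pytz
from mileage_logger.ingest.semantic_reader import Location, PlaceVisit
from mileage_logger.logic.detect_itinerary import SiteConfig, SiteEntry, detect_itinerary

tz = pytz.timezone("Europe/London")
sites = SiteConfig([
    SiteEntry("Home", "Home", 52.63, 1.30, 0.0, []),
    SiteEntry("SchoolA", "School A", 52.62, 1.48, 0.0, ["school a"]),
    SiteEntry("SchoolB", "School B", 52.70, 1.10, 0.0, ["school b"]),
])

def visit(name, lat, lon, start_h, end_h):
    """Build a one-day PlaceVisit localised to Europe/London."""
    return PlaceVisit(Location(lat, lon, name),
                      tz.localize(datetime(2025, 8, 12, start_h)),
                      tz.localize(datetime(2025, 8, 12, end_h)))

visits = [
    visit("Home", 52.63, 1.30, 7, 8),
    visit("School A", 52.62, 1.48, 9, 12),
    visit("School B", 52.70, 1.10, 13, 16),
    visit("Home", 52.63, 1.30, 17, 23),
]
for hop in detect_itinerary(visits, sites):
    print(hop.date, hop.origin, "->", hop.destination)
# 2025-08-12 Home -> SchoolA
# 2025-08-12 SchoolA -> SchoolB
# 2025-08-12 SchoolB -> Home
```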