Tech Spec — Scrape & Persist Internet Providers per Building Address

Generated 2025-09-28

Goal

Given one or more postal building addresses in New York, fetch the internet providers reported by the NY Broadband Map and save each provider row to a new Django model Document_Internet_Vendor with fields:

| Field | Type | Example |
| --- | --- | --- |
| name | CharField(200) | “Charter Communications” |
| technology | CharField(50) | “Fiber”, “Cable”, “DSL”, “Fixed Wireless”, “Other” |
| max_down_mbps | FloatField | 1000.0 |
| max_up_mbps | FloatField | 50.0 |

Per the request, the command takes no latitude/longitude inputs; it accepts addresses only.

Input & Output (conceptual)

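Input

One or more building addresses: a single quoted street address passed as the positional argument, and/or --input-file pointing to a text file with one address per line.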

Output (for logging / optional JSON file)

[
  {
    "address": "535 5th Ave, New York, NY 10017",
    "name": "Provider Name",
    "technology": "Fiber | Cable | DSL | Fixed Wireless | Other",
    "max_down_mbps": 1000.0,
    "max_up_mbps": 50.0
  }
]

The JSON above is for logging and optional export only; the system of record is the database table behind Document_Internet_Vendor.

Django model

# app: broadband/models.py

from django.db import models

class Document(models.Model):
    """A document record that carries an optional building address."""

    # Optional: may be left empty in forms (blank) and stored as NULL (null).
    building_address = models.CharField(
        max_length=200,
        blank=True,
        null=True,
    )


class Document_Internet_Vendor(models.Model):
    """One internet provider row: vendor name, technology and max speeds in Mbps."""

    # Optional link to a Document; vendor rows are removed together with it.
    document = models.ForeignKey(
        Document,
        null=True,
        on_delete=models.CASCADE,
    )
    name = models.CharField(max_length=200)
    technology = models.CharField(max_length=50)
    # Speeds are optional: either value may be stored as NULL.
    max_down_mbps = models.FloatField(null=True, blank=True)
    max_up_mbps = models.FloatField(null=True, blank=True)

    class Meta:
        verbose_name = "Document Internet Vendor"
        verbose_name_plural = "Document Internet Vendors"
        # Composite index over the (name, technology) pair.
        indexes = [models.Index(fields=["name", "technology"])]

    def __str__(self):
        """Return ``name (technology) ↓down ↑up`` for display purposes."""
        template = "{} ({}) ↓{} ↑{}"
        return template.format(
            self.name,
            self.technology,
            self.max_down_mbps,
            self.max_up_mbps,
        )

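Note: the model declares a nullable ForeignKey to Document, but the management command below does not populate it. If you later need to associate vendors with a specific document or address record, set that key when persisting rows (the linking step is intentionally left out of the command per spec).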

Management command (address-only)

Initial Implementation steps

Command name: building_providers

# app: broadband/management/commands/building_providers.py

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from broadband.models import Document_Internet_Vendor
import json, requests

# Query endpoint (layer 0 of the BroadbandAvailability_WGS MapServer) that
# fetch_rows_for_address sends its point-intersection request to.
ARCGIS_URL = "https://gis.dot.ny.gov/hostingny/rest/services/BroadbandAvailability_WGS/MapServer/0/query"

def geocode_to_lonlat(address: str):
    """Resolve a street address to a ``(lon, lat)`` pair -- placeholder only.

    No geocoder is wired in yet, so every call raises ``CommandError``.
    Replace the body with the geocoder of your choice (NYC Geoclient,
    Nominatim, Google, ...) and raise ``CommandError`` when an address
    cannot be resolved.
    """
    message = "No geocoder configured. Plug one into geocode_to_lonlat(address)."
    raise CommandError(message)

def fetch_rows_for_address(address: str, timeout=15):
    """Fetch the provider rows the ArcGIS layer reports for one address.

    The address is geocoded internally (the command takes no lat/lon input)
    and the resulting point is intersected with the layer at ``ARCGIS_URL``.

    Args:
        address: Street address to look up.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list of dicts with the keys ``address``, ``name``, ``technology``,
        ``max_down_mbps`` and ``max_up_mbps`` -- one per returned feature.
        The list is empty when the layer returns no features.

    Raises:
        CommandError: If geocoding fails or the service answers with an
            error payload.
        requests.HTTPError: If the HTTP status signals a failure.
    """
    lon, lat = geocode_to_lonlat(address)  # address-only → internally geocode
    params = {
        "geometry": f"{lon},{lat}",
        "geometryType": "esriGeometryPoint",
        "inSR": 4326,
        "outSR": 4326,
        "spatialRel": "esriSpatialRelIntersects",
        "outFields": "PROVIDERNAME,TECHNOLOGY,MAXADDOWN,MAXADUP",
        "returnGeometry": "false",
        "f": "json",
    }
    r = requests.get(ARCGIS_URL, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # ArcGIS REST services can report a failed query as HTTP 200 with an
    # "error" object in the body, which raise_for_status() does not catch.
    # Surface it instead of silently treating it as "no providers".
    error = data.get("error")
    if error:
        if isinstance(error, dict):
            detail = f"{error.get('message')} (code {error.get('code')})"
        else:
            detail = str(error)
        raise CommandError(f"ArcGIS query failed for {address!r}: {detail}")

    out = []
    # "or" guards against explicit JSON nulls as well as missing keys.
    for f in data.get("features") or []:
        a = f.get("attributes") or {}
        out.append({
            "address": address,
            "name": a.get("PROVIDERNAME"),
            "technology": a.get("TECHNOLOGY"),
            "max_down_mbps": a.get("MAXADDOWN"),
            "max_up_mbps": a.get("MAXADUP"),
        })
    return out

def persist_vendor_rows(rows: list[dict]) -> int:
    """Insert one ``Document_Internet_Vendor`` per row and return the count.

    Missing or falsy ``name`` / ``technology`` values are stored as empty
    strings; the speed values are passed through unchanged. All rows are
    written with a single ``bulk_create`` inside one transaction.
    """
    pending = []
    for row in rows:
        vendor = Document_Internet_Vendor(
            name=row.get("name") or "",
            technology=row.get("technology") or "",
            max_down_mbps=row.get("max_down_mbps"),
            max_up_mbps=row.get("max_up_mbps"),
        )
        pending.append(vendor)

    manager = Document_Internet_Vendor.objects
    with transaction.atomic():
        inserted = manager.bulk_create(pending, ignore_conflicts=False)
    return len(inserted)

class Command(BaseCommand):
    """Fetch internet providers for building addresses and store them.

    Addresses come from the positional argument and/or ``--input-file``.
    With ``--dry-run`` the rows are only printed as JSON; otherwise they are
    saved through ``persist_vendor_rows``. ``--outfile`` additionally writes
    the JSON to disk in either mode.
    """

    help = "Fetch & save internet providers for one or many building addresses in NY (address-only; no lat/lon args)."

    def add_arguments(self, parser):
        """Register the positional address and the optional flags."""
        parser.add_argument("address", nargs="?", help="Street address (quoted)")
        parser.add_argument("--input-file", help="Path to a text file with one address per line")
        parser.add_argument("--dry-run", action="store_true", help="Only print JSON; do not save to DB")
        parser.add_argument("--outfile", help="Optional path to also save JSON output")

    def handle(self, *args, **opts):
        """Collect addresses, fetch their provider rows, then print or save.

        Raises:
            CommandError: If no address was supplied or the input file
                cannot be read.
        """
        addresses = []

        # Strip before testing so a whitespace-only argument is not queued
        # as an empty address.
        single = (opts.get("address") or "").strip()
        if single:
            addresses.append(single)

        # argparse stores "--input-file" under the dest "input_file"
        # (dashes become underscores); the dashed key never exists in opts.
        if input_path := opts.get("input_file"):
            try:
                with open(input_path, "r", encoding="utf-8") as fh:
                    addresses.extend(ln.strip() for ln in fh if ln.strip())
            except OSError as exc:
                raise CommandError(f"Cannot read --input-file {input_path!r}: {exc}") from exc

        if not addresses:
            raise CommandError("Provide an address or --input-file with addresses.")

        all_rows = []
        for addr in addresses:
            all_rows.extend(fetch_rows_for_address(addr))

        # Same dest rule as above: "--dry-run" is exposed as "dry_run".
        if opts.get("dry_run"):
            self.stdout.write(json.dumps(all_rows, indent=2))
        else:
            saved = persist_vendor_rows(all_rows)
            self.stdout.write(self.style.SUCCESS(f"Saved {saved} vendor rows"))

        if path := opts.get("outfile"):
            with open(path, "w", encoding="utf-8") as f:
                f.write(json.dumps(all_rows, indent=2))

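The command accepts addresses only. Internally it geocodes each address and queries the ArcGIS layer. Results are saved into Document_Internet_Vendor unless --dry-run is given, in which case they are only printed as JSON. Note that geocode_to_lonlat is a stub: the command raises CommandError until a geocoder is plugged in.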

Examples

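python manage.py building_providers "535 5th Ave, New York, NY 10017"
python manage.py building_providers --input-file addresses.txt --dry-run --outfile providers.json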