Getting Bluesky (or Fediverse) RSS feeds to work

1 minute read


Some popular news outlets don't have RSS feeds, but they do have Fediverse/Bluesky accounts. Bluesky supposedly serves an RSS feed if you add /rss to the end of a profile URL, but for some reason I couldn't get it to work out of the box (OOTB).

No worries, worked w/ Claude to vibe-code a workaround.

  1. Grab the profile URL for the bsky page you want to follow
  2. The script below fetches content for newsraft. Save it as fediverseXml.sh (the name the later steps call), make it executable, and place it in your $PATH:
#!/usr/bin/env python3
"""Fetch microblog content and normalize to RSS for newsraft.

Handles two input types:
  - Standard RSS feed URL: patches titleless items, rewrites <link>
    to the first external URL found in the post body. Only <item>
    elements are patched; a feed that has none (e.g. Atom, which uses
    <entry>) is re-serialized without those changes.
  - Bluesky AT-URI (at://did:plc:.../app.bsky.feed.generator/<rkey>):
    calls the unauthenticated getFeed XRPC endpoint and emits RSS.

Usage: fediverseXml.sh <feed_url_or_at_uri> <output_file>

Exits with status 1 and a one-line message on stderr when the arguments
are wrong or the fetch/conversion fails.
"""

import html
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from email.utils import formatdate
from xml.etree import ElementTree as ET
from xml.sax.saxutils import escape

# Domains that are the post itself, not external content.
# find_external_url() skips URLs that match one of these.
SELF_DOMAINS = {"mstdn.social", "bsky.app", "mastodon.social"}

# Endpoint queried by build_rss_from_bsky(); the module docstring describes
# it as the unauthenticated getFeed XRPC endpoint.
BSKY_XRPC = "https://public.api.bsky.app/xrpc/app.bsky.feed.getFeed"
# User-Agent header value sent by fetch() with every request.
UA = "Mozilla/5.0 (fediverseXml)"


def strip_html(text):
    """Reduce an HTML fragment to single-spaced plain text.

    Args:
        text: HTML (or plain text) string.

    Returns:
        The text with tags removed, character entities decoded and every
        run of whitespace collapsed to one space; leading and trailing
        whitespace is dropped.
    """
    # Line breaks and block-level boundaries separate words, so they become
    # a space; otherwise "<p>a</p><p>b</p>" would collapse to "ab" and a URL
    # ending one paragraph would fuse with the first word of the next.
    text = re.sub(
        r"<br\s*/?>|</?(?:p|div|li|ul|ol|blockquote|h[1-6]|tr|td)\b[^>]*>",
        " ",
        text,
        flags=re.IGNORECASE,
    )
    # Inline tags (a, span, ...) are removed without a separator so that a
    # URL split across several <span> elements stays in one piece.
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities only after tag removal so "&lt;b&gt;" survives as the
    # literal text "<b>" instead of being stripped as a tag.
    text = html.unescape(text)
    return " ".join(text.split())


def find_external_url(text):
    """Return the first http(s) URL in *text* that is not on a self domain.

    Args:
        text: HTML or plain text of a post; tags are removed with
            strip_html() before searching.

    Returns:
        The URL with trailing ".,;:" punctuation removed, or None when no
        URL outside SELF_DOMAINS is found.
    """
    plain = strip_html(text)
    for match in re.finditer(r"https?://[^\s<>\"')\]]+", plain):
        url = match.group().rstrip(".,;:")
        try:
            host = (urllib.parse.urlsplit(url).hostname or "").lower()
        except ValueError:
            # e.g. an unbalanced "[" makes urlsplit reject the netloc
            continue
        if not host:
            continue
        # Compare hostnames (exact or sub-domain) rather than searching the
        # whole URL, so an external link that merely mentions a self domain
        # in its path or query string is still accepted.
        if any(host == d or host.endswith("." + d) for d in SELF_DOMAINS):
            continue
        return url
    return None


def fetch(url):
    """Download *url* and return the raw response body as bytes.

    The request carries the module's UA string as its User-Agent header and
    uses a 20 second timeout. Errors raised by urllib propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    response = urllib.request.urlopen(request, timeout=20)
    with response:
        body = response.read()
    return body


def patch_rss(feed_url, output):
    """Download an RSS feed, patch its items and write it to *output*.

    For every <item> that has a <description>:
      - a missing or blank <title> is filled with the first 120 characters
        of the tag-stripped description;
      - an existing <link> is pointed at the first external URL found in
        the description, when there is one.
    Items without a <description> are left untouched.

    Args:
        feed_url: URL of the feed to download.
        output: path of the XML file to write.
    """
    document = ET.ElementTree(ET.fromstring(fetch(feed_url)))

    for entry in document.findall(".//item"):
        body = entry.find("description")
        if body is None:
            continue
        raw = body.text or ""

        heading = entry.find("title")
        target = entry.find("link")

        # Create the <title> element on demand, then fill it when blank.
        if heading is None:
            heading = ET.SubElement(entry, "title")
        existing = heading.text or ""
        if not existing.strip():
            heading.text = strip_html(raw)[:120]

        candidate = find_external_url(raw)
        if target is not None and candidate:
            target.text = candidate

    document.write(output, encoding="unicode", xml_declaration=True)


def _facet_link(record):
    """Return the first external link stored in a post record's facets.

    Facet links carry the full target URL even when the visible post text
    shows it shortened or without a scheme, which the text regex would miss.
    Links on SELF_DOMAINS and non-http(s) URIs are ignored.
    Returns None when no suitable link exists.
    """
    for facet in record.get("facets") or []:
        for feature in facet.get("features") or []:
            if feature.get("$type") != "app.bsky.richtext.facet#link":
                continue
            uri = feature.get("uri")
            if not isinstance(uri, str):
                continue
            try:
                parts = urllib.parse.urlsplit(uri)
                host = (parts.hostname or "").lower()
            except ValueError:
                continue
            if parts.scheme not in ("http", "https") or not host:
                continue
            if any(host == d or host.endswith("." + d) for d in SELF_DOMAINS):
                continue
            return uri
    return None


def build_rss_from_bsky(at_uri, output, limit=30):
    """Fetch a Bluesky feed generator and write its posts to *output* as RSS.

    Args:
        at_uri: AT-URI of the feed
            (at://did:plc:.../app.bsky.feed.generator/<rkey>).
        output: path of the RSS file to write.
        limit: number of posts requested from the getFeed endpoint.

    Posts without text are skipped. The channel title is the last path
    segment of *at_uri*. Errors from fetch() and json.loads() propagate.
    """
    qs = urllib.parse.urlencode({"feed": at_uri, "limit": limit})
    data = json.loads(fetch(f"{BSKY_XRPC}?{qs}"))

    items = []
    for entry in data.get("feed") or []:
        # "or {}" also covers keys that are present but null.
        post = entry.get("post") or {}
        record = post.get("record") or {}
        text = (record.get("text") or "").strip()
        if not text:
            continue

        # Prefer external embed URL; then a link facet; then the first URL
        # in the text; last resort is the bsky post itself.
        embed = post.get("embed") or {}
        ext = (embed.get("external") or {}).get("uri")
        link = (
            ext
            or _facet_link(record)
            or find_external_url(text)
            or post_url(post)
        )

        # Collapse newlines/runs of whitespace so the title is one line.
        title = " ".join(text.split())[:120]
        pub = record.get("createdAt") or post.get("indexedAt")

        items.append((title, link, text, pub))

    channel_title = at_uri.rsplit("/", 1)[-1]
    write_rss(output, channel_title, items)


def post_url(post):
    """Translate a post's AT-URI into its bsky.app web address.

    Expects ``post["uri"]`` to look like
    ``at://<did>/app.bsky.feed.post/<rkey>``. When the key is absent or the
    value has a different shape, the bsky.app home page URL is returned.
    """
    found = re.match(
        r"at://([^/]+)/app\.bsky\.feed\.post/(.+)",
        post.get("uri", ""),
    )
    if found:
        did, rkey = found.group(1, 2)
        return f"https://bsky.app/profile/{did}/post/{rkey}"
    return "https://bsky.app"


# Characters that are not allowed in an XML 1.0 document.
_XML_ILLEGAL = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]")


def _xml_text(value):
    """Escape *value* for use as XML character data, dropping illegal chars."""
    return escape(_XML_ILLEGAL.sub("", value or ""))


def _rfc822(iso):
    """Convert an ISO-8601 timestamp to an RFC 822 date string in GMT.

    Falls back to the current time when *iso* is empty or cannot be parsed,
    so a single malformed timestamp does not abort the whole feed.
    Timestamps without a UTC offset are taken to be UTC.
    """
    if isinstance(iso, str) and iso.strip():
        candidate = iso.strip()
        if candidate.endswith(("Z", "z")):
            candidate = candidate[:-1] + "+00:00"
        # The second attempt drops fractional seconds, which older
        # datetime.fromisoformat versions only accept with 3 or 6 digits.
        for attempt in (candidate, re.sub(r"\.\d+", "", candidate)):
            try:
                parsed = datetime.fromisoformat(attempt)
                if parsed.tzinfo is None:
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return formatdate(timeval=parsed.timestamp(), usegmt=True)
            except (ValueError, OverflowError, OSError):
                continue
    return formatdate(usegmt=True)


def write_rss(output, channel_title, items):
    """Write an RSS 2.0 document to *output* (UTF-8).

    Args:
        output: path of the file to create or overwrite.
        channel_title: used as both the channel <title> and <description>.
        items: iterable of ``(title, link, text, pub)`` tuples; *pub* is an
            ISO-8601 timestamp string or a falsy value for "now". Each
            item's link doubles as its <guid>.
    """
    parts = [
        '<?xml version="1.0" encoding="utf-8"?>',
        '<rss version="2.0"><channel>',
        f"<title>{_xml_text(channel_title)}</title>",
        "<link>https://bsky.app</link>",
        f"<description>{_xml_text(channel_title)}</description>",
    ]
    for title, link, text, pub in items:
        safe_link = _xml_text(link)
        parts.extend(
            (
                "<item>",
                f"<title>{_xml_text(title)}</title>",
                f"<link>{safe_link}</link>",
                f"<description>{_xml_text(text)}</description>",
                f"<guid isPermaLink=\"false\">{safe_link}</guid>",
                f"<pubDate>{_rfc822(pub)}</pubDate>",
                "</item>",
            )
        )
    parts.append("</channel></rss>")

    with open(output, "w", encoding="utf-8") as f:
        f.write("".join(parts))


def main():
    """Command-line entry point: convert one feed and write it to a file.

    Expects exactly two arguments, ``<feed_url_or_at_uri> <output_file>``.
    An argument starting with "at://" is handled by build_rss_from_bsky();
    anything else goes to patch_rss(). On a usage error or any failure a
    one-line message is printed to stderr and the process exits with 1.
    """
    args = sys.argv[1:]
    if len(args) != 2:
        print(f"Usage: {sys.argv[0]} <feed_url_or_at_uri> <output_file>",
              file=sys.stderr)
        sys.exit(1)

    feed_url, output = args

    try:
        handler = (
            build_rss_from_bsky if feed_url.startswith("at://") else patch_rss
        )
        handler(feed_url, output)
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be matched first.
        failure = f"{feed_url}: HTTP {e.code}"
    except urllib.error.URLError as e:
        failure = f"{feed_url}: {e.reason}"
    except Exception as e:
        failure = f"{feed_url}: {e}"
    else:
        return

    print(failure, file=sys.stderr)
    sys.exit(1)


# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
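  1. Create a file called xmlFeeds.conf next to newsraft's feeds file, i.e. in $XDG_CONFIG_HOME/newsraft or $HOME/.config/newsraft (if your feeds file lives in $HOME/.newsraft instead, adjust the conf path in the shell function further down)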
  2. Place the profile URL in the file, followed by a file name that ends in .xml. It'll look something like this:
https://bsky.app/profile/did:plc:2iotnpqcz2pcanhti3apjs5j/rss wttw.xml
  1. In newsraft's feeds file, point it to your newly created file (in my case, wttw.xml).
#inside of `feeds` file in newsraft/config. I left some examples on excluding irrelevant content ;)
file:///path/to/newsraft/config/wttw.xml "WTTW" < item-rule NOT (title LIKE '%Blackhawks%' OR title LIKE '%Sox%' OR title LIKE '%Cubs%' OR title LIKE '%Bulls%' OR title LIKE '%Bears%')
  1. So now, whenever you manually run the Python script above, it will refresh the feed. I'd rather have this happen every time I open newsraft, so I created a function inside of .bash_aliases:
# nr: refresh every feed listed in xmlFeeds.conf, then start newsraft.
# Each line of the conf file is "<feed-url-or-at-uri> <output-file>"; blank
# lines and lines starting with '#' are skipped. All arguments are passed
# through to newsraft.
nr() {
    # Keep the helper variables out of the interactive shell's namespace.
    local conf dir url file
    echo "Loading .xml files.."
    conf="${XDG_CONFIG_HOME:-$HOME/.config}/newsraft/xmlFeeds.conf"
    dir="$HOME/path/to/newsraft/config"
    if [ -r "$conf" ]; then
        # '|| [ -n "$url" ]' keeps a last line that has no trailing newline.
        while read -r url file || [ -n "$url" ]; do
            [ -z "$url" ] && continue
            case "$url" in \#*) continue ;; esac
            if [ -z "$file" ]; then
                echo "nr: no output file given for $url, skipping" >&2
                continue
            fi
            # </dev/null so the script can never consume the conf file
            # that the loop is reading from stdin.
            fediverseXml.sh "$url" "$dir/$file" < /dev/null
            echo "Reading links..."
        done < "$conf"
    else
        echo "nr: cannot read $conf, skipping feed refresh" >&2
    fi
    command newsraft "$@"
}