Skip to main content

Ralsina.Me — Roberto Alsina's website

disqus2data.py (Source)

import json
import logging
from collections import defaultdict
import parse
import conf
def clean_url(u):
    """Normalize *u* for comparison: drop the scheme and the query string.

    A URL without "//" is assumed to already be scheme-less and is
    returned unchanged.
    """
    if "//" not in u:
        return u
    # Everything after the first "//", then everything before the first "?".
    rest = u.partition("//")[2]
    return rest.partition("?")[0]
# The site's base URL with scheme/query stripped (e.g. "ralsina.me/").
BASE_URL = clean_url(conf.SITE_URL)

# Load the Disqus export (already converted from XML to JSON).
# Use a context manager so the file handle is closed deterministically,
# and pin the encoding rather than relying on the platform default.
with open("disqus-data.json", encoding="utf-8") as _disqus_file:
    data = json.load(_disqus_file)

# Each fix is a parse pattern and a format pattern. Get data using one, set data using the other.
thread_link_fixes = [
    [
        "{}://web.archive.org/web/{:d}/http://lateral.netmanagers.com.ar/{path}",
        "http://ralsina.me/{path}",
    ],
    ["{}://lateral.netmanagers.com.ar/{path}", "http://ralsina.me/{path}"],
    ["{}://localhost:8080/{path}", "http://ralsina.me/{path}"],
    [
        "http://feedproxy.google.com/~r/LateralOpinion/~3/{}/{path}",
        "http://ralsina.me/weblog/posts/{path}",
    ],
    [
        "https://disqus.com/home/discussion/lateralopinion/bb{numero1:d}_{numero2:d}/",
        "http://ralsina.me/weblog/posts/BB{numero1}.{numero2:02d}.html",
    ],
    ["{}://example.com/posts/{file}", "http://ralsina.me/weblog/posts/{file}"],
    ["{start}/bbs{id:d}", "{start}/BBS{id:d}"],  # bbs -> BBS
    [
        "{}://ralsina.me/tr/es/{path}",
        "http://ralsina.me/{path}",
    ],  # unify translation comments
    [
        "{}://ralsina.me/weblog/posts/{folder}/",
        "http://ralsina.me/weblog/posts/{folder}.html",
    ],  # unprettify URLs
    [
        "https://ralsina.me/weblog/posts/old-guy-the-terminal-ep-2-puede-fallar.html",
        "https://ralsina.me/weblog/posts/old-guy-the-terminal-ep-3-puede-fallar.html",
    ],
    ["https://ralsina.me/stories/nombres.html", "https://ralsina.me/stories/nombres/"],
]
# Ancient test threads
ignore_links = {
    "http://ralsina.me/weblog/posts/xxx.html",
    "http://ubuntuone.com/p/lKX/",
    "http://ralsina.me/weblog/posts/index.html",
}
# Map every post/page URL on the site (scheme/query stripped) to its object.
site_urls = {clean_url(p.permalink(absolute=True)): p for p in site.timeline}
# Group Disqus comments by the thread they belong to, and index them by id.
posts_per_thread = defaultdict(list)
posts_by_id = {}
for comment in data["disqus"]["post"]:
    posts_per_thread[comment["thread"]["@dsq:id"]].append(comment)
    posts_by_id[comment["@dsq:id"]] = comment
# Build the set of threads worth keeping, repairing their links along the way.
threads = {}
for t in data["disqus"]["thread"]:
    if t["@dsq:id"] not in posts_per_thread:
        # Empty thread, don't care
        continue
    if t["link"] in ignore_links:
        print(f'Ignoring {t["link"]}')
        continue
    # If the thread's URL doesn't match any current site URL, try each
    # parse/format fix in order.  NOTE: fixes may chain — a matching fix
    # rewrites t["link"] in place even when the result still isn't a known
    # URL, so later patterns run against the already-rewritten link.
    if not clean_url(t["link"]) in site_urls:
        for parser, formatter in thread_link_fixes:
            parsed = parse.parse(parser, t["link"])
            if parsed is not None:
                t["link"] = formatter.format(**parsed.named)
                if clean_url(t["link"]) in site_urls:
                    break
        else:
            # for/else: the loop finished without break — no fix produced a
            # URL the site knows about; report it for manual follow-up.
            print(f'Unfixed thread link: {t["link"]}')
    threads[t["@dsq:id"]] = t
def find_post_for_thread(thread, site):
    """Return the site post whose permalink matches the thread's link.

    Links are compared with their scheme stripped (but the query string,
    if any, kept).  Returns None when no post matches.
    """
    target = thread['link'].split('//')[1]
    for candidate in site.timeline:
        if candidate.permalink(absolute=True).split('//')[1] == target:
            return candidate
    return None
# Write each surviving comment as a ".wpcomment" file next to its post's
# source file, in the format Nikola's comment importer expects.
for thread_id in threads:
    post = find_post_for_thread(threads[thread_id], site)
    if post is None:
        print('Orphan thread ===>', threads[thread_id])
        continue
    # NOTE(review): assumes the first "." in source_path starts the file
    # extension; a dot earlier in the path would truncate it — confirm.
    base_path = post.source_path.split('.')[0]
    # Renamed from "post" to "comment": the original shadowed the outer
    # `post` variable, which only worked by accident of statement order.
    for comment in posts_per_thread[thread_id]:
        if comment['isDeleted'] == 'true':
            # Skip comments that were deleted in Disqus.
            continue
        comment_path = f"{base_path}.{comment['@dsq:id']}.wpcomment"
        print(comment_path)
        output = f""".. id: {comment['@dsq:id']}
.. approved: True
.. author: {comment['author']['name']}
.. date_utc: {comment['createdAt']}
.. compiler: html
{comment['message']}"""
        if 'parent' in comment:
            # Disqus parent ids can look like "...=NNN"; keep only the tail.
            parent = comment['parent']['@dsq:id'].split('=')[-1]
            output = f".. parent_id: {parent}\n" + output
        # Explicit UTF-8: comment bodies routinely contain non-ASCII text,
        # and the platform default encoding is not guaranteed to handle it.
        with open(comment_path, "w", encoding="utf-8") as outf:
            outf.write(output)

Contents © 2000-2020 Roberto Alsina