#!/usr/bin/env python3 # -*- coding: utf-8 -*- import argparse import concurrent.futures import json import os import re import shutil import time from datetime import datetime from html import escape from urllib.parse import urljoin, urlparse import requests try: from PIL import Image, ImageOps except ImportError: Image = None ImageOps = None USER_UPLOADS_RE = re.compile(r'(https?://[^\s"\'<>]+?/user_uploads/[^\s"\'<>]+|/user_uploads/[^\s"\'<>]+)') IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp"} HTML_TEMPLATE = """ Zulip Archive

Zulip Archive

__META__
0 results
Loading messages…
""" def zulip_get_messages(session, base_url, narrow, anchor="newest", num_before=500, num_after=0): params = { "anchor": anchor, "num_before": num_before, "num_after": num_after, "narrow": json.dumps(narrow), "apply_markdown": "true" } url = urljoin(base_url, "/api/v1/messages") r = session.get(url, params=params) if r.status_code >= 400: print("ERROR STATUS:", r.status_code) print("ERROR BODY:", r.text) r.raise_for_status() return r.json() def ensure_dir(path): os.makedirs(path, exist_ok=True) def sanitize_filename(name): return re.sub(r'[^A-Za-z0-9._-]+', '_', name) def sanitize_output_basename(name): sanitized = re.sub(r'[^A-Za-z0-9]+', '_', name or '') sanitized = re.sub(r'_+', '_', sanitized).strip('_') return sanitized def to_web_path(path): return path.replace("\\", "/") def is_image_upload_url(upload_url): path = urlparse(upload_url).path ext = os.path.splitext(path)[1].lower() return ext in IMAGE_EXTENSIONS def download_upload(session, base_url, upload_url, target_dir): if upload_url.startswith("/"): full_url = urljoin(base_url, upload_url) else: full_url = upload_url parsed = urlparse(full_url) filename = sanitize_filename(os.path.basename(parsed.path)) local_path = os.path.join(target_dir, filename) if os.path.exists(local_path): return local_path try: r = session.get(full_url, stream=True) r.raise_for_status() with open(local_path, "wb") as file_obj: for chunk in r.iter_content(chunk_size=8192): if chunk: file_obj.write(chunk) return local_path except requests.exceptions.HTTPError as err: status = err.response.status_code if err.response is not None else "unknown" print(f"WARN: failed to download upload ({status}): {full_url}") return None except requests.exceptions.RequestException as err: print(f"WARN: network error downloading upload: {full_url} ({err})") return None def convert_to_webp(local_path, webp_dir, quality): ext = os.path.splitext(local_path)[1].lower() if ext not in IMAGE_EXTENSIONS: return None if Image is None: return None base_name = os.path.splitext(os.path.basename(local_path))[0] webp_path = os.path.join(webp_dir, f"{base_name}.webp") if os.path.exists(webp_path): return webp_path try: with Image.open(local_path) as image: image = ImageOps.exif_transpose(image) if image.mode in ("RGBA", "LA", "P"): image = image.convert("RGBA") else: image = image.convert("RGB") image.save(webp_path, "WEBP", quality=quality, method=6) return webp_path except Exception as err: # pylint: disable=broad-except print(f"WARN: could not convert to WEBP {local_path}: {err}") return None def get_stream_id(session, base_url, stream_name): url = urljoin(base_url, "/api/v1/get_stream_id") response = session.get(url, params={"stream": stream_name}) if response.status_code >= 400: print("WARN: could not resolve stream_id for topic verification") return None data = response.json() return data.get("stream_id") def get_stream_topics(session, base_url, stream_id): url = urljoin(base_url, f"/api/v1/users/me/{stream_id}/topics") response = session.get(url) if response.status_code >= 400: print("WARN: could not fetch stream topics for verification") return set() data = response.json() return {topic.get("name", "") for topic in data.get("topics", []) if topic.get("name")} def transform_message_content(session, base_url, content, uploads_dir, images_original_dir, out_dir): uploads = set(USER_UPLOADS_RE.findall(content)) replacements = {} message_local_paths = set() for upload_url in uploads: target_dir = images_original_dir if is_image_upload_url(upload_url) else uploads_dir local_path = download_upload(session, base_url, upload_url, target_dir) if not local_path: continue rel_path = to_web_path(os.path.relpath(local_path, out_dir)) replacements[upload_url] = rel_path message_local_paths.add(rel_path) transformed = content for original_url, rel_path in sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True): transformed = transformed.replace(original_url, rel_path) text_for_search = re.sub('<[^<]+?>', ' ', transformed) text_for_search = " ".join(text_for_search.split()) return transformed, text_for_search, message_local_paths def prepare_messages_json(messages, session, base_url, uploads_dir, images_original_dir, out_dir): js_messages = [] all_local_paths = set() for message in messages: sender = message["sender_full_name"] topic = message["subject"] or "(no topic)" timestamp = datetime.fromtimestamp(message["timestamp"]).strftime("%Y-%m-%d %H:%M") raw_content = message["content"] transformed_content, search_text, message_local_paths = transform_message_content( session=session, base_url=base_url, content=raw_content, uploads_dir=uploads_dir, images_original_dir=images_original_dir, out_dir=out_dir, ) all_local_paths.update(message_local_paths) combined_search = f"{sender} {topic} {search_text}" js_messages.append({ "id": message["id"], "sender": escape(sender), "topic": escape(topic), "date": timestamp, "day": timestamp[:10], "content": transformed_content, "search": combined_search, "_local_paths": sorted(message_local_paths), }) return js_messages, all_local_paths def convert_uploads_to_webp_parallel(local_rel_paths, out_dir, webp_dir, quality, workers): if Image is None: return {} rel_paths = sorted(local_rel_paths) if not rel_paths: return {} webp_map = {} def convert_one(rel_path): local_path = os.path.join(out_dir, rel_path.replace("/", os.sep)) webp_path = convert_to_webp(local_path, webp_dir, quality) if not webp_path: return rel_path, None webp_rel_path = to_web_path(os.path.relpath(webp_path, out_dir)) return rel_path, webp_rel_path max_workers = max(1, workers) with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_map = {executor.submit(convert_one, rel_path): rel_path for rel_path in rel_paths} for future in concurrent.futures.as_completed(future_map): rel_path, webp_rel_path = future.result() if webp_rel_path: webp_map[rel_path] = webp_rel_path return webp_map def apply_webp_replacements(js_messages, webp_map): for message in js_messages: content = message["content"] local_paths = message.pop("_local_paths", []) for original_rel_path in local_paths: webp_rel_path = webp_map.get(original_rel_path) if not webp_rel_path: continue content = content.replace(original_rel_path, webp_rel_path) message["content"] = content return js_messages def build_html(meta, messages_json_filename, render_chunk_size): return build_html_with_messages(meta, messages_json_filename, render_chunk_size, None) def build_html_with_messages(meta, messages_json_filename, render_chunk_size, messages_payload): embedded_json = "null" if messages_payload is not None: embedded_json = json.dumps({"messages": messages_payload}, ensure_ascii=False).replace("