From ba91b0765c1a3bf2597b83b7080e4077413361d7 Mon Sep 17 00:00:00 2001
From: kprkpr
Date: Fri, 13 Feb 2026 13:39:56 +0100
Subject: [PATCH] Add zulip_export

---
 zulip/zulip_export/zulip_export.py | 1093 ++++++++++++++++++++++++++++
 1 file changed, 1093 insertions(+)
 create mode 100644 zulip/zulip_export/zulip_export.py

diff --git a/zulip/zulip_export/zulip_export.py b/zulip/zulip_export/zulip_export.py
new file mode 100644
index 0000000..b55a0df
--- /dev/null
+++ b/zulip/zulip_export/zulip_export.py
@@ -0,0 +1,1093 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import concurrent.futures
+import json
+import os
+import re
+import shutil
+import time
+from datetime import datetime
+from html import escape
+from urllib.parse import urljoin, urlparse
+
+import requests
+
+try:
+    from PIL import Image, ImageOps
+except ImportError:
+    Image = None
+    ImageOps = None
+
+USER_UPLOADS_RE = re.compile(r'(https?://[^\s"\'<>]+?/user_uploads/[^\s"\'<>]+|/user_uploads/[^\s"\'<>]+)')
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp"}
+
+# NOTE: minimal placeholder markup below; the template's original styles, ids
+# and client-side script did not survive, and only its visible text
+# ("Zulip Archive", __META__, "0 results", "Loading messages…") is preserved.
+HTML_TEMPLATE = """<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Zulip Archive</title>
+</head>
+<body>
+<div class="wrap">
+  <header>Zulip Archive</header>
+  <div id="meta">__META__</div>
+  <div id="controls">
+    <input id="search" type="search" placeholder="Search">
+    <span id="result-count">0 results</span>
+  </div>
+  <div id="messages">Loading messages…</div>
+</div>
+</body>
+</html>
+"""
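+
+
+# Fetch one page of message history from GET /api/v1/messages. narrow is a
+# list of Zulip filter dicts (e.g. [{"operator": "stream", "operand": "general"}]);
+# anchor together with num_before/num_after pages through older messages.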
+def zulip_get_messages(session, base_url, narrow, anchor="newest", num_before=500, num_after=0):
+    params = {
+        "anchor": anchor,
+        "num_before": num_before,
+        "num_after": num_after,
+        "narrow": json.dumps(narrow),
+        "apply_markdown": "true"
+    }
+    url = urljoin(base_url, "/api/v1/messages")
+    r = session.get(url, params=params)
+    if r.status_code >= 400:
+        print("ERROR STATUS:", r.status_code)
+        print("ERROR BODY:", r.text)
+        r.raise_for_status()
+    return r.json()
+
+
+def ensure_dir(path):
+    os.makedirs(path, exist_ok=True)
+
+
+def sanitize_filename(name):
+    return re.sub(r'[^A-Za-z0-9._-]+', '_', name)
+
+
+def sanitize_output_basename(name):
+    sanitized = re.sub(r'[^A-Za-z0-9]+', '_', name or '')
+    sanitized = re.sub(r'_+', '_', sanitized).strip('_')
+    return sanitized
+
+
+def to_web_path(path):
+    return path.replace("\\", "/")
+
+
+def is_image_upload_url(upload_url):
+    path = urlparse(upload_url).path
+    ext = os.path.splitext(path)[1].lower()
+    return ext in IMAGE_EXTENSIONS
+
+
+def download_upload(session, base_url, upload_url, target_dir):
+    if upload_url.startswith("/"):
+        full_url = urljoin(base_url, upload_url)
+    else:
+        full_url = upload_url
+
+    parsed = urlparse(full_url)
+    filename = sanitize_filename(os.path.basename(parsed.path))
+    local_path = os.path.join(target_dir, filename)
+
+    if os.path.exists(local_path):
+        return local_path
+
+    try:
+        r = session.get(full_url, stream=True)
+        r.raise_for_status()
+        with open(local_path, "wb") as file_obj:
+            for chunk in r.iter_content(chunk_size=8192):
+                if chunk:
+                    file_obj.write(chunk)
+        return local_path
+    except requests.exceptions.HTTPError as err:
+        status = err.response.status_code if err.response is not None else "unknown"
+        print(f"WARN: failed to download upload ({status}): {full_url}")
+        return None
+    except requests.exceptions.RequestException as err:
+        print(f"WARN: network error downloading upload: {full_url} ({err})")
+        return None
+
+
+def convert_to_webp(local_path, webp_dir, quality):
+    ext = os.path.splitext(local_path)[1].lower()
+    if ext not in IMAGE_EXTENSIONS:
+        return None
+
+    if Image is None:
+        return None
+
+    base_name = os.path.splitext(os.path.basename(local_path))[0]
+    webp_path = os.path.join(webp_dir, f"{base_name}.webp")
+
+    if os.path.exists(webp_path):
+        return webp_path
+
+    try:
+        with Image.open(local_path) as image:
+            image = ImageOps.exif_transpose(image)
+            if image.mode in ("RGBA", "LA", "P"):
+                image = image.convert("RGBA")
+            else:
+                image = image.convert("RGB")
+            image.save(webp_path, "WEBP", quality=quality, method=6)
+        return webp_path
+    except Exception as err:  # pylint: disable=broad-except
+        print(f"WARN: could not convert to WEBP {local_path}: {err}")
+        return None
+
+
+def get_stream_id(session, base_url, stream_name):
+    url = urljoin(base_url, "/api/v1/get_stream_id")
+    response = session.get(url, params={"stream": stream_name})
+    if response.status_code >= 400:
+        print("WARN: could not resolve stream_id for topic verification")
+        return None
+    data = response.json()
+    return data.get("stream_id")
+
+
+def get_stream_topics(session, base_url, stream_id):
+    url = urljoin(base_url, f"/api/v1/users/me/{stream_id}/topics")
+    response = session.get(url)
+    if response.status_code >= 400:
+        print("WARN: could not fetch stream topics for verification")
+        return set()
+    data = response.json()
+    return {topic.get("name", "") for topic in data.get("topics", []) if topic.get("name")}
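+
+
+# Rewrite a message's rendered HTML so /user_uploads/ links point at local
+# copies (images under images_original_dir, other files under uploads_dir) and
+# derive a tag-stripped plain-text version used for client-side search.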
+def transform_message_content(session, base_url, content, uploads_dir,
+                              images_original_dir, out_dir):
+    uploads = set(USER_UPLOADS_RE.findall(content))
+    replacements = {}
+    message_local_paths = set()
+
+    for upload_url in uploads:
+        target_dir = images_original_dir if is_image_upload_url(upload_url) else uploads_dir
+        local_path = download_upload(session, base_url, upload_url, target_dir)
+        if not local_path:
+            continue
+
+        rel_path = to_web_path(os.path.relpath(local_path, out_dir))
+        replacements[upload_url] = rel_path
+        message_local_paths.add(rel_path)
+
+    transformed = content
+    for original_url, rel_path in sorted(replacements.items(), key=lambda item: len(item[0]), reverse=True):
+        transformed = transformed.replace(original_url, rel_path)
+
+    text_for_search = re.sub('<[^<]+?>', ' ', transformed)
+    text_for_search = " ".join(text_for_search.split())
+
+    return transformed, text_for_search, message_local_paths
+
+
+def prepare_messages_json(messages, session, base_url, uploads_dir, images_original_dir, out_dir):
+    js_messages = []
+    all_local_paths = set()
+
+    for message in messages:
+        sender = message["sender_full_name"]
+        topic = message["subject"] or "(no topic)"
+        timestamp = datetime.fromtimestamp(message["timestamp"]).strftime("%Y-%m-%d %H:%M")
+        raw_content = message["content"]
+
+        transformed_content, search_text, message_local_paths = transform_message_content(
+            session=session,
+            base_url=base_url,
+            content=raw_content,
+            uploads_dir=uploads_dir,
+            images_original_dir=images_original_dir,
+            out_dir=out_dir,
+        )
+        all_local_paths.update(message_local_paths)
+
+        combined_search = f"{sender} {topic} {search_text}"
+        js_messages.append({
+            "id": message["id"],
+            "sender": escape(sender),
+            "topic": escape(topic),
+            "date": timestamp,
+            "day": timestamp[:10],
+            "content": transformed_content,
+            "search": combined_search,
+            "_local_paths": sorted(message_local_paths),
+        })
+
+    return js_messages, all_local_paths
+
+
+def convert_uploads_to_webp_parallel(local_rel_paths, out_dir, webp_dir, quality, workers):
+    if Image is None:
+        return {}
+
+    rel_paths = sorted(local_rel_paths)
+    if not rel_paths:
+        return {}
+
+    webp_map = {}
+
+    def convert_one(rel_path):
+        local_path = os.path.join(out_dir, rel_path.replace("/", os.sep))
+        webp_path = convert_to_webp(local_path, webp_dir, quality)
+        if not webp_path:
+            return rel_path, None
+        webp_rel_path = to_web_path(os.path.relpath(webp_path, out_dir))
+        return rel_path, webp_rel_path
+
+    max_workers = max(1, workers)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_map = {executor.submit(convert_one, rel_path): rel_path for rel_path in rel_paths}
+        for future in concurrent.futures.as_completed(future_map):
+            rel_path, webp_rel_path = future.result()
+            if webp_rel_path:
+                webp_map[rel_path] = webp_rel_path
+
+    return webp_map
+
+
+def apply_webp_replacements(js_messages, webp_map):
+    for message in js_messages:
+        content = message["content"]
+        local_paths = message.pop("_local_paths", [])
+
+        for original_rel_path in local_paths:
+            webp_rel_path = webp_map.get(original_rel_path)
+            if not webp_rel_path:
+                continue
+
+            content = content.replace(original_rel_path, webp_rel_path)
+
+        message["content"] = content
+
+    return js_messages
+
+
+def build_html(meta, messages_json_filename, render_chunk_size):
+    return build_html_with_messages(meta, messages_json_filename, render_chunk_size, None)
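+
+
+# Build the final HTML page. When messages_payload is provided, it is embedded
+# into the page as JSON; with None (see build_html above) embedded_json stays
+# "null" and the viewer is expected to load messages_json_filename instead.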
+def build_html_with_messages(meta, messages_json_filename, render_chunk_size, messages_payload):
+    embedded_json = "null"
+    if messages_payload is not None:
+        embedded_json = json.dumps({"messages": messages_payload}, ensure_ascii=False).replace("