Files
usefulscripts/zulip/zulip_export/zulip_export.py
2026-02-13 13:39:56 +01:00

1094 lines
35 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import concurrent.futures
import json
import os
import re
import shutil
import time
from datetime import datetime
from html import escape
from urllib.parse import urljoin, urlparse
import requests
# Pillow is optional: without it the export still works, but images are not
# converted to WEBP (callers check ``Image is None``).
try:
    from PIL import Image, ImageOps
except ImportError:
    Image = ImageOps = None

# Matches upload links in message HTML, either absolute
# (http(s)://host/.../user_uploads/...) or server-relative (/user_uploads/...).
# A match ends at whitespace, a quote or an angle bracket.
USER_UPLOADS_RE = re.compile(r'(https?://[^\s"\'<>]+?/user_uploads/[^\s"\'<>]+|/user_uploads/[^\s"\'<>]+)')

# Lower-case file extensions that are treated as images.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp"}
# Self-contained HTML/CSS/JS page for the exported archive.
# build_html_with_messages() substitutes these placeholders:
#   __META__               escaped summary line shown below the page title
#   __EMBEDDED_MESSAGES__  JSON payload embedded in the page, or the literal null
#   __MESSAGES_JSON__      file name of the JSON payload fetched when nothing is embedded
#   __RENDER_CHUNK_SIZE__  number of messages rendered per animation frame
# Backslashes meant for the JavaScript source are doubled so Python does not
# treat them as (invalid) string escapes.
HTML_TEMPLATE = """<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>Zulip Archive</title>
<style>
:root {
--bg: #f7f7fb;
--card: #fff;
--text: #111;
--subtle: #555;
--border: #e6e6f0;
--highlight: #fff7cc;
--active: #d6ebff;
}
body {
font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
margin: 0;
background: var(--bg);
color: var(--text);
}
.wrap {
max-width: 1000px;
margin: 0 auto;
padding: 20px;
}
h1 {
margin: 0 0 8px;
}
.meta {
color: var(--subtle);
margin-bottom: 12px;
}
.toolbar {
position: sticky;
top: 0;
z-index: 100;
background: var(--bg);
border-bottom: 1px solid var(--border);
padding: 12px 0;
margin-bottom: 16px;
display: flex;
gap: 8px;
align-items: center;
flex-wrap: wrap;
}
.toolbar input[type="search"] {
flex: 1;
min-width: 260px;
padding: 10px 12px;
border: 1px solid #ccc;
border-radius: 8px;
font-size: 14px;
}
.toolbar input[type="date"] {
min-width: 170px;
padding: 10px 12px;
border: 1px solid #ccc;
border-radius: 8px;
font-size: 14px;
background: #fff;
}
.toolbar button {
padding: 10px 12px;
border: 1px solid #ccc;
border-radius: 8px;
background: #fff;
cursor: pointer;
}
.toolbar button:disabled {
opacity: 0.45;
cursor: not-allowed;
}
.count {
color: var(--subtle);
font-size: 0.95rem;
margin-left: 4px;
}
#messages {
min-height: 180px;
}
.msg {
background: var(--card);
border: 1px solid var(--border);
border-radius: 12px;
padding: 14px 16px;
margin: 12px 0;
box-shadow: 0 1px 2px rgba(0,0,0,0.04);
scroll-margin-top: 90px;
}
.msg-header {
display: flex;
flex-wrap: wrap;
gap: 8px 16px;
font-size: 0.9rem;
color: #444;
margin-bottom: 8px;
}
.sender {
font-weight: 600;
color: #222;
}
.topic {
background: #eef;
padding: 2px 8px;
border-radius: 999px;
}
.date {
color: #666;
}
.content img {
max-width: min(420px, 100%);
max-height: 300px;
width: auto;
height: auto;
border-radius: 8px;
margin: 8px 0;
cursor: zoom-in;
display: block;
}
.attachment-row {
display: flex;
gap: 8px;
flex-wrap: wrap;
margin: 8px 0;
}
.attachment-btn {
display: inline-flex;
align-items: center;
gap: 6px;
border: 1px solid #cfd3df;
border-radius: 8px;
padding: 7px 11px;
font-size: 0.9rem;
text-decoration: none;
color: #1f2937;
background: #f8faff;
}
.attachment-btn:hover {
background: #edf3ff;
border-color: #bfc9e3;
}
.attachment-btn svg {
width: 16px;
height: 16px;
display: block;
flex: 0 0 16px;
}
.attachment-doc {
background: #f7f7f9;
}
.attachment-name {
color: #444;
font-size: 0.88rem;
margin: 4px 0 8px;
word-break: break-all;
}
.hidden {
display: none;
}
.match {
background: var(--highlight);
}
.match.active {
border-color: #8bb9ff;
box-shadow: 0 0 0 2px var(--active);
}
#loading {
color: var(--subtle);
padding: 8px 0;
}
.modal {
position: fixed;
inset: 0;
background: rgba(0, 0, 0, 0.82);
display: none;
align-items: center;
justify-content: center;
z-index: 999;
padding: 24px;
box-sizing: border-box;
}
.modal.open {
display: flex;
}
.modal-panel {
width: min(96vw, 1200px);
max-height: 95vh;
display: flex;
flex-direction: column;
gap: 10px;
align-items: stretch;
}
.modal-toolbar {
display: flex;
justify-content: flex-end;
gap: 8px;
}
.modal-btn {
display: inline-flex;
align-items: center;
gap: 6px;
border: 1px solid rgba(255,255,255,0.42);
border-radius: 8px;
padding: 8px 12px;
color: #fff;
background: rgba(0,0,0,0.35);
text-decoration: none;
cursor: pointer;
font-size: 0.9rem;
}
.modal-btn:hover {
background: rgba(0,0,0,0.55);
}
.modal-btn svg {
width: 16px;
height: 16px;
display: block;
flex: 0 0 16px;
}
.modal-media {
display: flex;
justify-content: center;
align-items: center;
min-height: 180px;
}
.modal-media img {
max-width: 95vw;
max-height: 95vh;
width: auto;
height: auto;
border-radius: 10px;
box-shadow: 0 8px 40px rgba(0,0,0,0.5);
cursor: default;
}
.modal-media iframe {
width: min(95vw, 1100px);
height: min(88vh, 900px);
border: none;
border-radius: 10px;
box-shadow: 0 8px 40px rgba(0,0,0,0.5);
background: #fff;
}
.modal-hidden {
display: none !important;
}
</style>
</head>
<body>
<div class="wrap">
<h1>Zulip Archive</h1>
<div class="meta">__META__</div>
<div class="toolbar">
<input id="q" type="search" placeholder="Search messages..." />
<button id="prevBtn" type="button" disabled>Previous</button>
<button id="nextBtn" type="button" disabled>Next</button>
<input id="dateQ" type="date" />
<button id="goDateBtn" type="button">Go to date</button>
<span class="count" id="count">0 results</span>
</div>
<div id="loading">Loading messages…</div>
<div id="messages"></div>
</div>
<div class="modal" id="imgModal" aria-hidden="true">
<div class="modal-panel">
<div class="modal-toolbar">
<a id="modalDownloadBtn" class="modal-btn" href="#" download>
<svg viewBox="0 0 24 24" aria-hidden="true"><path fill="currentColor" d="M12 3a1 1 0 0 1 1 1v8.59l2.3-2.29a1 1 0 1 1 1.4 1.41l-4 4a1 1 0 0 1-1.4 0l-4-4a1 1 0 0 1 1.4-1.41L11 12.59V4a1 1 0 0 1 1-1Zm-7 14a1 1 0 0 1 1 1v1h12v-1a1 1 0 1 1 2 0v2a1 1 0 0 1-1 1H5a1 1 0 0 1-1-1v-2a1 1 0 0 1 1-1Z"/></svg>
Download
</a>
<button id="modalCloseBtn" class="modal-btn" type="button">
<svg viewBox="0 0 24 24" aria-hidden="true"><path fill="currentColor" d="M6.7 5.3a1 1 0 0 0-1.4 1.4L10.6 12l-5.3 5.3a1 1 0 1 0 1.4 1.4l5.3-5.3 5.3 5.3a1 1 0 0 0 1.4-1.4L13.4 12l5.3-5.3a1 1 0 1 0-1.4-1.4L12 10.6 6.7 5.3Z"/></svg>
Close
</button>
</div>
<div class="modal-media">
<img id="modalImg" alt="Attachment preview" class="modal-hidden" />
<iframe id="modalPdf" title="PDF preview" class="modal-hidden"></iframe>
</div>
</div>
</div>
<script id="embeddedMessages" type="application/json">__EMBEDDED_MESSAGES__</script>
<script>
const MESSAGES_JSON_URL = '__MESSAGES_JSON__';
const RENDER_CHUNK_SIZE = __RENDER_CHUNK_SIZE__;
const messagesContainer = document.getElementById('messages');
const loadingEl = document.getElementById('loading');
const searchInput = document.getElementById('q');
const dateInput = document.getElementById('dateQ');
const goDateBtn = document.getElementById('goDateBtn');
const prevBtn = document.getElementById('prevBtn');
const nextBtn = document.getElementById('nextBtn');
const countEl = document.getElementById('count');
const modal = document.getElementById('imgModal');
const modalImg = document.getElementById('modalImg');
const modalPdf = document.getElementById('modalPdf');
const modalDownloadBtn = document.getElementById('modalDownloadBtn');
const modalCloseBtn = document.getElementById('modalCloseBtn');
let allMessageElements = [];
let currentMatches = [];
let currentMatchIndex = -1;
function normalize(s) {
return (s || '').toLowerCase();
}
function isImagePath(pathname) {
return /\\.(jpe?g|png|gif|bmp|webp|tiff?)$/i.test(pathname || '');
}
function isPdfPath(pathname) {
return /\\.pdf$/i.test(pathname || '');
}
function getHrefPath(href) {
try {
return new URL(href, window.location.href).pathname || '';
} catch (_) {
return href || '';
}
}
function getFileName(pathname) {
const parts = (pathname || '').split('/').filter(Boolean);
if (!parts.length) {
return 'file';
}
const lastPart = parts[parts.length - 1];
// A malformed percent escape makes decodeURIComponent throw; keep the raw segment then.
try {
return decodeURIComponent(lastPart);
} catch (_) {
return lastPart;
}
}
function iconSvg(kind) {
if (kind === 'download') {
return '<svg viewBox="0 0 24 24" aria-hidden="true"><path fill="currentColor" d="M12 3a1 1 0 0 1 1 1v8.59l2.3-2.29a1 1 0 1 1 1.4 1.41l-4 4a1 1 0 0 1-1.4 0l-4-4a1 1 0 0 1 1.4-1.41L11 12.59V4a1 1 0 0 1 1-1Zm-7 14a1 1 0 0 1 1 1v1h12v-1a1 1 0 1 1 2 0v2a1 1 0 0 1-1 1H5a1 1 0 0 1-1-1v-2a1 1 0 0 1 1-1Z"/></svg>';
}
if (kind === 'pdf') {
return '<svg viewBox="0 0 24 24" aria-hidden="true"><path fill="currentColor" d="M6 2a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8.83a2 2 0 0 0-.59-1.41l-4.83-4.83A2 2 0 0 0 13.17 2H6Zm7 1.5c.13 0 .26.05.35.15l5 5a.5.5 0 0 1-.35.85h-4.5A1.5 1.5 0 0 1 12 8V3.5h1ZM8 14a1 1 0 0 1 1-1h1.5a2.5 2.5 0 0 1 0 5H10v1a1 1 0 1 1-2 0v-5Zm2 2v-1h.5a.5.5 0 0 1 0 1H10Zm4-2a1 1 0 0 1 1-1h2a1 1 0 1 1 0 2h-1v1h.5a1 1 0 1 1 0 2H16v1a1 1 0 1 1-2 0v-5Z"/></svg>';
}
return '<svg viewBox="0 0 24 24" aria-hidden="true"><path fill="currentColor" d="M6 2a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9.17a2 2 0 0 0-.59-1.41l-4.17-4.17A2 2 0 0 0 13.83 3H6Zm8 1.5c.13 0 .26.05.35.15l4 4a.5.5 0 0 1-.35.85h-3.5A1.5 1.5 0 0 1 13 7V3.5h1ZM8 13a1 1 0 1 1 0-2h8a1 1 0 1 1 0 2H8Zm0 4a1 1 0 1 1 0-2h8a1 1 0 1 1 0 2H8Z"/></svg>';
}
function buildActionLink(href, label, kind, className = '') {
const action = document.createElement('a');
action.href = href;
action.className = `attachment-btn ${className}`.trim();
action.innerHTML = iconSvg(kind);
// The label may contain a file name taken from a link, so it is added as text, never as markup.
const labelSpan = document.createElement('span');
labelSpan.textContent = label;
action.appendChild(labelSpan);
return action;
}
function decorateAttachmentLink(anchor) {
const href = anchor.getAttribute('href');
if (!href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:')) {
return;
}
const pathname = getHrefPath(href);
if (!pathname || isImagePath(pathname)) {
return;
}
const fileName = getFileName(pathname);
if (isPdfPath(pathname)) {
const row = document.createElement('div');
row.className = 'attachment-row';
const previewBtn = buildActionLink(href, 'View PDF', 'pdf');
previewBtn.addEventListener('click', (event) => {
event.preventDefault();
openModal(href, 'pdf', fileName);
});
const downloadBtn = buildActionLink(href, 'Download PDF', 'download');
downloadBtn.setAttribute('download', fileName);
row.appendChild(previewBtn);
row.appendChild(downloadBtn);
const name = document.createElement('div');
name.className = 'attachment-name';
name.textContent = fileName;
const block = document.createElement('div');
block.appendChild(row);
block.appendChild(name);
anchor.replaceWith(block);
return;
}
const downloadBtn = buildActionLink(href, `Download ${fileName}`, 'download', 'attachment-doc');
downloadBtn.setAttribute('download', fileName);
anchor.replaceWith(downloadBtn);
}
function enhanceAttachmentLinks(scope) {
const anchors = scope.querySelectorAll('.content a[href]');
anchors.forEach((a) => decorateAttachmentLink(a));
}
function createMessageElement(msg) {
const wrapper = document.createElement('div');
wrapper.className = 'msg';
wrapper.dataset.text = msg.search;
wrapper.dataset.messageId = msg.id;
wrapper.dataset.day = msg.day || (msg.date || '').slice(0, 10);
wrapper.innerHTML = `
<div class="msg-header">
<span class="sender">${msg.sender}</span>
<span class="topic">${msg.topic}</span>
<span class="date">${msg.date}</span>
</div>
<div class="content">${msg.content}</div>
`;
const images = wrapper.querySelectorAll('.content img');
images.forEach((img) => {
img.addEventListener('click', (event) => {
event.preventDefault();
const fullSrc = img.getAttribute('src');
openModal(fullSrc, 'image');
});
});
enhanceAttachmentLinks(wrapper);
return wrapper;
}
function renderChunks(messages, start = 0) {
if (start >= messages.length) {
loadingEl.textContent = `Loaded ${allMessageElements.length} messages`;
updateSearch();
return;
}
const fragment = document.createDocumentFragment();
const end = Math.min(start + RENDER_CHUNK_SIZE, messages.length);
for (let i = start; i < end; i += 1) {
const msg = messages[i];
const el = createMessageElement(msg);
allMessageElements.push(el);
fragment.appendChild(el);
}
messagesContainer.appendChild(fragment);
loadingEl.textContent = `Loading… ${allMessageElements.length} messages`;
requestAnimationFrame(() => renderChunks(messages, end));
}
async function loadMessages() {
try {
const embeddedEl = document.getElementById('embeddedMessages');
const embeddedRaw = embeddedEl ? (embeddedEl.textContent || '').trim() : '';
if (embeddedRaw && embeddedRaw !== 'null') {
const payload = JSON.parse(embeddedRaw);
const messages = Array.isArray(payload) ? payload : (payload.messages || []);
renderChunks(messages, 0);
return;
}
const response = await fetch(MESSAGES_JSON_URL, { cache: 'no-store' });
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const payload = await response.json();
const messages = Array.isArray(payload) ? payload : (payload.messages || []);
renderChunks(messages, 0);
} catch (error) {
loadingEl.textContent = 'Error loading messages JSON. If you open this HTML with file://, use a local server.';
console.error('Failed to load messages JSON:', error);
}
}
function updateSearch() {
const q = normalize(searchInput.value);
currentMatches = [];
currentMatchIndex = -1;
allMessageElements.forEach((m) => {
m.classList.remove('hidden', 'match', 'active');
if (!q) {
return;
}
const text = normalize(m.dataset.text || '');
if (text.includes(q)) {
m.classList.add('match');
currentMatches.push(m);
} else {
m.classList.add('hidden');
}
});
if (!q) {
countEl.textContent = `${allMessageElements.length} messages`;
} else if (currentMatches.length === 0) {
countEl.textContent = '0 results';
} else {
currentMatchIndex = 0;
focusCurrentMatch(false);
countEl.textContent = `${currentMatchIndex + 1} / ${currentMatches.length}`;
}
const hasMatches = currentMatches.length > 0;
prevBtn.disabled = !hasMatches;
nextBtn.disabled = !hasMatches;
}
function focusCurrentMatch(shouldScroll = true) {
if (!currentMatches.length) {
return;
}
currentMatches.forEach((m) => m.classList.remove('active'));
const target = currentMatches[currentMatchIndex];
target.classList.add('active');
if (shouldScroll) {
target.scrollIntoView({ behavior: 'smooth', block: 'start' });
}
countEl.textContent = `${currentMatchIndex + 1} / ${currentMatches.length}`;
}
function goToNext() {
if (!currentMatches.length) {
return;
}
currentMatchIndex = (currentMatchIndex + 1) % currentMatches.length;
focusCurrentMatch(true);
}
function goToPrev() {
if (!currentMatches.length) {
return;
}
currentMatchIndex = (currentMatchIndex - 1 + currentMatches.length) % currentMatches.length;
focusCurrentMatch(true);
}
function goToDate() {
const targetDay = (dateInput.value || '').trim();
if (!targetDay) {
return;
}
let target = allMessageElements.find((m) => (m.dataset.day || '') === targetDay);
if (!target) {
target = allMessageElements.find((m) => (m.dataset.day || '') > targetDay);
}
if (!target) {
countEl.textContent = `No messages from ${targetDay}`;
return;
}
allMessageElements.forEach((m) => m.classList.remove('active'));
target.classList.remove('hidden');
target.classList.add('active');
target.scrollIntoView({ behavior: 'smooth', block: 'start' });
countEl.textContent = `Date: ${target.dataset.day}`;
}
function openModal(src, kind = 'image', downloadName = '') {
const resolvedName = downloadName || getFileName(getHrefPath(src));
modalDownloadBtn.href = src;
modalDownloadBtn.setAttribute('download', resolvedName || 'file');
modalDownloadBtn.dataset.downloadSrc = src;
modalDownloadBtn.dataset.downloadName = resolvedName || 'file';
if (kind === 'pdf') {
modalImg.classList.add('modal-hidden');
modalImg.src = '';
modalPdf.classList.remove('modal-hidden');
modalPdf.src = src;
} else {
modalPdf.classList.add('modal-hidden');
modalPdf.src = '';
modalImg.classList.remove('modal-hidden');
modalImg.src = src;
}
modal.classList.add('open');
modal.setAttribute('aria-hidden', 'false');
}
function closeModal() {
modal.classList.remove('open');
modal.setAttribute('aria-hidden', 'true');
modalImg.src = '';
modalPdf.src = '';
modalPdf.classList.add('modal-hidden');
modalImg.classList.add('modal-hidden');
modalDownloadBtn.href = '#';
delete modalDownloadBtn.dataset.downloadSrc;
delete modalDownloadBtn.dataset.downloadName;
}
async function triggerDownload(url, fileName) {
const response = await fetch(url, { cache: 'no-store' });
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const blob = await response.blob();
const objectUrl = URL.createObjectURL(blob);
const tempLink = document.createElement('a');
tempLink.href = objectUrl;
tempLink.download = fileName || 'file';
document.body.appendChild(tempLink);
tempLink.click();
tempLink.remove();
URL.revokeObjectURL(objectUrl);
}
searchInput.addEventListener('input', updateSearch);
searchInput.addEventListener('keydown', (event) => {
if (event.key === 'Enter') {
if (event.shiftKey) {
goToPrev();
} else {
goToNext();
}
}
});
prevBtn.addEventListener('click', goToPrev);
nextBtn.addEventListener('click', goToNext);
goDateBtn.addEventListener('click', goToDate);
dateInput.addEventListener('keydown', (event) => {
if (event.key === 'Enter') {
goToDate();
}
});
modal.addEventListener('click', (event) => {
if (event.target === modal) {
closeModal();
}
});
modalDownloadBtn.addEventListener('click', async (event) => {
event.preventDefault();
const src = modalDownloadBtn.dataset.downloadSrc || modalDownloadBtn.getAttribute('href');
const fileName = modalDownloadBtn.dataset.downloadName || 'file';
if (!src || src === '#') {
return;
}
try {
await triggerDownload(src, fileName);
} catch (error) {
console.error('Download failed, falling back to direct link:', error);
const fallbackLink = document.createElement('a');
fallbackLink.href = src;
fallbackLink.download = fileName;
document.body.appendChild(fallbackLink);
fallbackLink.click();
fallbackLink.remove();
}
});
modalCloseBtn.addEventListener('click', closeModal);
document.addEventListener('keydown', (event) => {
if (event.key === 'Escape') {
closeModal();
}
});
loadMessages();
</script>
</body>
</html>
"""
def zulip_get_messages(session, base_url, narrow, anchor="newest", num_before=500, num_after=0, timeout=60):
    """Fetch one page of messages from ``/api/v1/messages``.

    Args:
        session: Object with a requests-style ``get(url, params=..., timeout=...)``
            method, already carrying the credentials.
        base_url: Base URL of the Zulip server.
        narrow: List of narrow filter dicts; sent JSON-encoded.
        anchor: Message id or keyword the page is positioned on.
        num_before: Number of messages to request before the anchor.
        num_after: Number of messages to request after the anchor.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the export forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx answer; status and
        body are printed first to make API errors diagnosable.
    """
    params = {
        "anchor": anchor,
        "num_before": num_before,
        "num_after": num_after,
        "narrow": json.dumps(narrow),
        "apply_markdown": "true",
    }
    url = urljoin(base_url, "/api/v1/messages")
    response = session.get(url, params=params, timeout=timeout)
    if response.status_code >= 400:
        print("ERROR STATUS:", response.status_code)
        print("ERROR BODY:", response.text)
    response.raise_for_status()
    return response.json()
def ensure_dir(path):
    """Create directory *path* together with missing parents; an existing directory is fine."""
    os.makedirs(name=path, exist_ok=True)
def sanitize_filename(name):
    """Return *name* with every run of characters outside ``A-Za-z0-9._-`` replaced by one ``_``."""
    unsafe_run = re.compile(r'[^A-Za-z0-9._-]+')
    return unsafe_run.sub('_', name)
def sanitize_output_basename(name):
    """Reduce *name* to ASCII letters and digits joined by single underscores.

    Every run of other characters (underscores included) becomes one ``_``;
    no underscore is left at either end. ``None`` and ``""`` give ``""``.
    """
    alnum_parts = re.split(r'[^A-Za-z0-9]+', name or '')
    return '_'.join(part for part in alnum_parts if part)
def to_web_path(path):
    """Return *path* with every backslash turned into a forward slash."""
    return "/".join(path.split("\\"))
def is_image_upload_url(upload_url):
    """Tell whether the path of *upload_url* ends in one of ``IMAGE_EXTENSIONS``.

    Only the URL path is inspected (query string and fragment are ignored) and
    the comparison is case-insensitive.
    """
    _, extension = os.path.splitext(urlparse(upload_url).path)
    return extension.lower() in IMAGE_EXTENSIONS
def _without_auth(prepared_request):
    """Auth hook that returns the request unchanged, so no credentials are attached."""
    return prepared_request


def download_upload(session, base_url, upload_url, target_dir, timeout=60):
    """Download one upload into *target_dir* and return the local file path.

    Args:
        session: requests session carrying the Zulip credentials.
        base_url: Base URL of the Zulip server.
        upload_url: Absolute or server-relative ``/user_uploads/`` URL.
        target_dir: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the local file (possibly already downloaded earlier), or
        ``None`` when the download failed; a warning is printed in that case.
    """
    # urljoin() resolves server-relative paths against base_url and leaves
    # absolute URLs untouched.
    full_url = urljoin(base_url, upload_url)
    parsed = urlparse(full_url)

    # Name the file after the whole path below /user_uploads/, not only its
    # last segment: different uploads frequently share a basename, and with
    # basename-only naming the second one silently reused the first one's file.
    upload_key = parsed.path.split("/user_uploads/", 1)[-1]
    filename = sanitize_filename(upload_key)
    if not filename.strip("._"):
        # "", "." or ".." would resolve to a directory instead of a file.
        filename = "upload"
    local_path = os.path.join(target_dir, filename)
    if os.path.isfile(local_path):
        return local_path

    request_kwargs = {"stream": True, "timeout": timeout}
    if parsed.netloc.lower() != urlparse(base_url).netloc.lower():
        # Message content is untrusted and may link to other hosts; never
        # send the Zulip credentials stored on the session to them.
        request_kwargs["auth"] = _without_auth

    # Write to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_path = f"{local_path}.part"
    try:
        with session.get(full_url, **request_kwargs) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file_obj:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file_obj.write(chunk)
        os.replace(partial_path, local_path)
        return local_path
    except requests.exceptions.HTTPError as err:
        status = err.response.status_code if err.response is not None else "unknown"
        print(f"WARN: failed to download upload ({status}): {full_url}")
        return None
    except requests.exceptions.RequestException as err:
        print(f"WARN: network error downloading upload: {full_url} ({err})")
        return None
    finally:
        # Only present when the download did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)
def convert_to_webp(local_path, webp_dir, quality):
    """Convert the image at *local_path* to WEBP inside *webp_dir*.

    Args:
        local_path: Path of the source image.
        webp_dir: Existing directory that receives ``<basename>.webp``.
        quality: WEBP quality passed to Pillow.

    Returns:
        Path of the WEBP file (reused when it already exists), or ``None`` when
        no conversion is done: the extension is not in ``IMAGE_EXTENSIONS``,
        Pillow is missing, the image is animated, or the conversion failed.
        ``None`` means the caller keeps using the original file.
    """
    ext = os.path.splitext(local_path)[1].lower()
    if ext not in IMAGE_EXTENSIONS or Image is None:
        return None
    base_name = os.path.splitext(os.path.basename(local_path))[0]
    webp_path = os.path.join(webp_dir, f"{base_name}.webp")
    if os.path.exists(webp_path):
        return webp_path

    # Save under a temporary name and rename on success, so a failed save
    # cannot leave a truncated .webp that later runs would accept as valid.
    partial_path = f"{webp_path}.part"
    try:
        with Image.open(local_path) as source:
            if ext in (".gif", ".png", ".webp") and getattr(source, "is_animated", False):
                # Converting would keep only a single frame; keep the
                # animated original instead.
                return None
            image = ImageOps.exif_transpose(source)
            # Keep an alpha channel for modes that can carry transparency.
            target_mode = "RGBA" if image.mode in ("RGBA", "LA", "P") else "RGB"
            image.convert(target_mode).save(partial_path, "WEBP", quality=quality, method=6)
        os.replace(partial_path, webp_path)
        return webp_path
    except Exception as err:  # pylint: disable=broad-except
        print(f"WARN: could not convert to WEBP {local_path}: {err}")
        return None
    finally:
        # Only present when the conversion did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)
def get_stream_id(session, base_url, stream_name, timeout=30):
    """Look up the numeric id of *stream_name* via ``/api/v1/get_stream_id``.

    The result is only used for the optional topic verification, so every
    failure (HTTP error status, network error, timeout, non-JSON body) is
    reported as a warning and ``None`` is returned instead of raising.

    Args:
        session: requests session carrying the Zulip credentials.
        base_url: Base URL of the Zulip server.
        stream_name: Name of the stream to resolve.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``stream_id`` value of the response, or ``None``.
    """
    url = urljoin(base_url, "/api/v1/get_stream_id")
    try:
        response = session.get(url, params={"stream": stream_name}, timeout=timeout)
        if response.status_code >= 400:
            print("WARN: could not resolve stream_id for topic verification")
            return None
        return response.json().get("stream_id")
    except (requests.exceptions.RequestException, ValueError) as err:
        print(f"WARN: could not resolve stream_id for topic verification ({err})")
        return None
def get_stream_topics(session, base_url, stream_id, timeout=30):
    """Return the set of topic names of a stream via ``/api/v1/users/me/<id>/topics``.

    The result is only used for the optional topic verification, so every
    failure (HTTP error status, network error, timeout, non-JSON body) is
    reported as a warning and an empty set is returned instead of raising.

    Args:
        session: requests session carrying the Zulip credentials.
        base_url: Base URL of the Zulip server.
        stream_id: Numeric id of the stream.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Set of non-empty topic names.
    """
    url = urljoin(base_url, f"/api/v1/users/me/{stream_id}/topics")
    try:
        response = session.get(url, timeout=timeout)
        if response.status_code >= 400:
            print("WARN: could not fetch stream topics for verification")
            return set()
        topics = response.json().get("topics", [])
    except (requests.exceptions.RequestException, ValueError) as err:
        print(f"WARN: could not fetch stream topics for verification ({err})")
        return set()
    # Entries without a (non-empty) name are ignored.
    return {topic["name"] for topic in topics if topic.get("name")}
def transform_message_content(session, base_url, content, uploads_dir, images_original_dir, out_dir):
    """Download the uploads referenced in *content* and point the HTML at the local copies.

    Images (judged by URL extension) go to *images_original_dir*, everything
    else to *uploads_dir*. Uploads that cannot be downloaded keep their
    original URL.

    Returns:
        A tuple ``(html, search_text, local_paths)``: the rewritten HTML, the
        same content with tags removed and whitespace collapsed, and the set of
        local paths (relative to *out_dir*, forward slashes) now referenced.
    """
    url_to_local = {}
    for remote_url in set(USER_UPLOADS_RE.findall(content)):
        destination = uploads_dir
        if is_image_upload_url(remote_url):
            destination = images_original_dir
        saved_path = download_upload(session, base_url, remote_url, destination)
        if saved_path:
            url_to_local[remote_url] = to_web_path(os.path.relpath(saved_path, out_dir))

    # Longest URLs first, so a URL that is a prefix of another one cannot
    # clobber part of the longer URL.
    rewritten = content
    for remote_url in sorted(url_to_local, key=len, reverse=True):
        rewritten = rewritten.replace(remote_url, url_to_local[remote_url])

    # NOTE: only tags are stripped; HTML entities stay encoded in the search text.
    without_tags = re.sub('<[^<]+?>', ' ', rewritten)
    search_text = " ".join(without_tags.split())
    return rewritten, search_text, set(url_to_local.values())
def prepare_messages_json(messages, session, base_url, uploads_dir, images_original_dir, out_dir):
    """Turn raw API messages into the records consumed by the HTML page.

    Each record carries the escaped sender and topic, the timestamp formatted
    in local time, the message HTML rewritten to local upload paths, a plain
    search string and, under ``_local_paths``, the sorted local files the
    message references.

    Returns:
        A tuple ``(records, all_local_paths)``; the second item is the union
        of the local paths of all messages.
    """
    records = []
    collected_paths = set()
    for raw in messages:
        author = raw["sender_full_name"]
        subject = raw["subject"] or "(no topic)"
        stamp = datetime.fromtimestamp(raw["timestamp"]).strftime("%Y-%m-%d %H:%M")
        html_body, plain_text, paths = transform_message_content(
            session=session,
            base_url=base_url,
            content=raw["content"],
            uploads_dir=uploads_dir,
            images_original_dir=images_original_dir,
            out_dir=out_dir,
        )
        collected_paths |= paths
        record = {
            "id": raw["id"],
            "sender": escape(author),
            "topic": escape(subject),
            "date": stamp,
            # The first ten characters of the timestamp are the YYYY-MM-DD day.
            "day": stamp[:10],
            "content": html_body,
            "search": f"{author} {subject} {plain_text}",
            "_local_paths": sorted(paths),
        }
        records.append(record)
    return records, collected_paths
def convert_uploads_to_webp_parallel(local_rel_paths, out_dir, webp_dir, quality, workers):
    """Convert the given uploads to WEBP using a thread pool.

    Args:
        local_rel_paths: Paths relative to *out_dir*, using forward slashes.
        out_dir: Export root directory.
        webp_dir: Directory that receives the WEBP files.
        quality: WEBP quality handed to ``convert_to_webp``.
        workers: Requested number of threads; at least one is used.

    Returns:
        Dict mapping each successfully converted relative path to the relative
        path of its WEBP file. Empty when Pillow is missing or nothing was given.
    """
    if Image is None:
        return {}
    ordered = sorted(local_rel_paths)
    if not ordered:
        return {}

    def _convert(rel_path):
        # Relative web paths use "/"; turn them into a path for this OS.
        source = os.path.join(out_dir, rel_path.replace("/", os.sep))
        produced = convert_to_webp(source, webp_dir, quality)
        if produced:
            return to_web_path(os.path.relpath(produced, out_dir))
        return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, workers)) as pool:
        converted = pool.map(_convert, ordered)
        return {rel: webp for rel, webp in zip(ordered, converted) if webp}
def apply_webp_replacements(js_messages, webp_map):
    """Point message HTML at the WEBP version of every converted upload.

    The helper key ``_local_paths`` is removed from each message, and the
    messages are modified in place.

    Args:
        js_messages: List of message dicts with ``content`` and, optionally,
            ``_local_paths``.
        webp_map: Dict from original relative path to WEBP relative path.

    Returns:
        The same list object that was passed in.
    """
    for entry in js_messages:
        html_body = entry["content"]
        for source_path in entry.pop("_local_paths", []):
            replacement = webp_map.get(source_path)
            if replacement:
                html_body = html_body.replace(source_path, replacement)
        entry["content"] = html_body
    return js_messages
def build_html(meta, messages_json_filename, render_chunk_size):
    """Build the archive page without embedded messages; the page loads them from the JSON file."""
    return build_html_with_messages(
        meta=meta,
        messages_json_filename=messages_json_filename,
        render_chunk_size=render_chunk_size,
        messages_payload=None,
    )
def build_html_with_messages(meta, messages_json_filename, render_chunk_size, messages_payload):
    """Fill ``HTML_TEMPLATE`` and return the finished archive page.

    Args:
        meta: Summary line; HTML-escaped before insertion.
        messages_json_filename: Name of the JSON file the page fetches when no
            messages are embedded.
        render_chunk_size: Messages rendered per step; values below 1 become 1.
        messages_payload: List of message records to embed, or ``None`` to
            embed the literal ``null``.

    Returns:
        The page as a string.
    """
    embedded_json = "null"
    if messages_payload is not None:
        # Escape every "<", not only "</": inside a <script> element the
        # sequence "<!--" followed by "<script" also changes how the browser
        # finds the end of the element. In JSON "<" can only occur inside
        # strings, where \u003c decodes to the same character.
        embedded_json = json.dumps({"messages": messages_payload}, ensure_ascii=False).replace("<", "\\u003c")

    # The file name lands inside a single-quoted JavaScript string literal.
    js_filename = json.dumps(messages_json_filename)[1:-1].replace("'", "\\'").replace("<", "\\u003c")

    substitutions = {
        "__META__": escape(meta),
        "__MESSAGES_JSON__": js_filename,
        "__RENDER_CHUNK_SIZE__": str(max(1, render_chunk_size)),
        "__EMBEDDED_MESSAGES__": embedded_json,
    }
    # Substitute in one pass so that placeholder-like text inside an inserted
    # value (for example in the stream name) is never substituted itself.
    placeholder_re = re.compile("|".join(re.escape(token) for token in substitutions))
    return placeholder_re.sub(lambda match: substitutions[match.group(0)], HTML_TEMPLATE)
def verify_topics_if_needed(session, base_url, stream_name, exported_messages, topic_filter):
    """Report whether a whole-stream export covers every topic the API lists.

    Nothing happens when a topic filter is active or when the stream id cannot
    be resolved. Otherwise the topic names from the API are compared with the
    ``subject`` values of the exported messages and the outcome is printed.
    """
    if topic_filter:
        return
    stream_id = get_stream_id(session, base_url, stream_name)
    if stream_id is None:
        return

    api_topics = get_stream_topics(session, base_url, stream_id)
    exported_topics = set()
    for msg in exported_messages:
        subject = msg.get("subject", "")
        if subject:
            exported_topics.add(subject)
    missing = sorted(api_topics.difference(exported_topics))

    print(f"INFO: topics detected in API: {len(api_topics)}")
    print(f"INFO: topics present in export: {len(exported_topics)}")
    if not missing:
        print("OK: export includes all detectable stream topics.")
        return
    print(f"WARN: missing {len(missing)} topics in export (sample): {missing[:10]}")
def main():
    """Command-line entry point: export a Zulip stream (or one topic) to HTML and JSON.

    Steps: parse the arguments, page through the message history from newest
    to oldest, download the referenced uploads, optionally convert images to
    WEBP, then write ``<name>.json`` and ``<name>.html`` into the output
    directory.
    """
    parser = argparse.ArgumentParser(description="Export Zulip messages to HTML with optimized attachments.")
    parser.add_argument("--base-url", required=True, help="Zulip base URL, e.g. https://zulip.yourcompany.com")
    parser.add_argument("--email", required=True, help="Your Zulip email")
    parser.add_argument("--api-key", required=True, help="Your Zulip API key")
    parser.add_argument("--stream", required=True, help="Stream name")
    parser.add_argument("--topic", default=None, help="Topic name (optional)")
    parser.add_argument("--out", default="zulip_export", help="Output directory")
    parser.add_argument("--chunk-size", type=int, default=120, help="Messages per JS render chunk")
    parser.add_argument("--embed-html", dest="embed_html", action="store_true", default=True, help="Embed messages in generated HTML (default: enabled)")
    parser.add_argument("--no-embed-html", dest="embed_html", action="store_false", help="Do not embed messages; load from generated JSON")
    parser.add_argument("--webp-quality", type=int, default=72, help="WEBP quality (1-100)")
    parser.add_argument("--webp-workers", type=int, default=max(2, (os.cpu_count() or 4)), help="Worker threads for WEBP conversion")
    parser.add_argument("--delete-original-images", action="store_true", help="Delete uploads_originalimages after successful WEBP conversion")
    args = parser.parse_args()

    base_url = args.base_url.rstrip("/")
    out_dir = args.out
    uploads_dir = os.path.join(out_dir, "uploads")
    images_original_dir = os.path.join(out_dir, "uploads_originalimages")
    webp_dir = os.path.join(out_dir, "uploads_webp")
    for directory in (out_dir, uploads_dir, images_original_dir, webp_dir):
        ensure_dir(directory)
    if Image is None:
        print("WARN: Pillow is not installed. Original images will be used without WEBP conversion.")

    session = requests.Session()
    session.auth = (args.email, args.api_key)
    narrow = [{"operator": "stream", "operand": args.stream}]
    if args.topic:
        narrow.append({"operator": "topic", "operand": args.topic})

    # Page backwards through the history, one page per request.
    page_size = 1000
    all_messages = []
    anchor = "newest"
    while True:
        data = zulip_get_messages(session, base_url, narrow, anchor=anchor, num_before=page_size, num_after=0)
        messages = data.get("messages", [])
        if not messages:
            break
        all_messages.extend(messages)
        if len(messages) < page_size:
            break
        # Continue just below the oldest message received so far; min() does
        # not depend on the order of the messages within a page.
        anchor = min(message["id"] for message in messages) - 1
        time.sleep(0.2)

    # Pages arrive newest-first, so messages sharing a timestamp (one-second
    # resolution) need the id as tie-breaker to end up in sending order.
    all_messages.sort(key=lambda msg: (msg["timestamp"], msg["id"]))

    verify_topics_if_needed(
        session=session,
        base_url=base_url,
        stream_name=args.stream,
        exported_messages=all_messages,
        topic_filter=args.topic,
    )
    meta = (
        f"Stream: {args.stream}"
        + (f" · Topic: {args.topic}" if args.topic else " · All topics")
        + f" · Total: {len(all_messages)}"
    )
    webp_quality = max(1, min(100, args.webp_quality))

    messages_payload, local_paths = prepare_messages_json(
        messages=all_messages,
        session=session,
        base_url=base_url,
        uploads_dir=uploads_dir,
        images_original_dir=images_original_dir,
        out_dir=out_dir,
    )
    webp_map = convert_uploads_to_webp_parallel(
        local_rel_paths=local_paths,
        out_dir=out_dir,
        webp_dir=webp_dir,
        quality=webp_quality,
        workers=args.webp_workers,
    )
    messages_payload = apply_webp_replacements(messages_payload, webp_map)
    if Image is not None:
        print(f"INFO: WEBP conversions available: {len(webp_map)}")

    if args.delete_original_images:
        # Originals may only go when every one of them has a WEBP replacement.
        original_image_paths = {path for path in local_paths if path.startswith("uploads_originalimages/")}
        failed_image_paths = original_image_paths - set(webp_map.keys())
        if Image is None:
            print("WARN: --delete-original-images skipped because Pillow is not installed.")
        elif failed_image_paths:
            print(f"WARN: --delete-original-images skipped. {len(failed_image_paths)} image(s) still use originals.")
        elif os.path.isdir(images_original_dir):
            shutil.rmtree(images_original_dir, ignore_errors=True)
            print(f"OK: deleted {images_original_dir}")

    # Output files are named after the stream and, when given, the topic.
    stream_base = sanitize_output_basename(args.stream) or "zulip_export"
    if args.topic:
        topic_base = sanitize_output_basename(args.topic)
        if topic_base:
            stream_base = f"{stream_base}_{topic_base}"
    messages_json_filename = f"{stream_base}.json"
    html_filename = f"{stream_base}.html"

    messages_json_path = os.path.join(out_dir, messages_json_filename)
    with open(messages_json_path, "w", encoding="utf-8") as file_obj:
        json.dump({"messages": messages_payload}, file_obj, ensure_ascii=False)

    html = build_html_with_messages(
        meta=meta,
        messages_json_filename=messages_json_filename,
        render_chunk_size=args.chunk_size,
        messages_payload=messages_payload if args.embed_html else None,
    )
    html_path = os.path.join(out_dir, html_filename)
    with open(html_path, "w", encoding="utf-8") as file_obj:
        file_obj.write(html)

    print(f"OK: {html_path}")
    print(f"OK: {messages_json_path}")


if __name__ == "__main__":
    main()