This commit is contained in:
parent
a0706c95c3
commit
18f4b4e1e0
|
@ -1 +1,5 @@
|
||||||
mbox-files/*
|
mbox-files/*
|
||||||
|
output
|
||||||
|
output/*
|
||||||
|
trash
|
||||||
|
trash/*
|
||||||
|
|
12
Readme.md
12
Readme.md
|
@ -1,7 +1,13 @@
|
||||||
## Mbox to Markdown converter
|
## Mbox to Markdown converter
|
||||||
|
|
||||||
|
This simple Docker image converts a .mbox file (for example, from a Google Gmail account export / Takeout) into Markdown files.
|
||||||
|
I used it for archival purposes: I wanted to store my old Gmail emails.
|
||||||
|
Nothing fancy, but useful for those who don't want to use online converters or mess with Python directly.
|
||||||
|
|
||||||
|
## Usage:
|
||||||
|
|
||||||
|
```
|
||||||
docker build -t mbox-to-markdown .
|
docker build -t mbox-to-markdown .
|
||||||
|
|
||||||
docker run --rm -v ./mbox-files:/mnt/input -v /path/to/output/directory:/mnt/output mbox-to-markdown python mbox_to_markdown.py /mnt/input/yourfile.mbox /mnt/output/
|
docker run --rm -v ./mbox-files:/mnt/input -v ./output/:/mnt/output mbox-to-markdown
|
||||||
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,31 +1,114 @@
|
||||||
import mailbox
|
import mailbox
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from email.utils import parsedate_tz, mktime_tz
|
||||||
from markdownify import markdownify
|
from markdownify import markdownify
|
||||||
|
from watchdog.observers import Observer
|
||||||
|
from watchdog.events import FileSystemEventHandler
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
mbox_file = 'path/to/your/file.mbox'
|
input_dir = '/mnt/input'
|
||||||
output_dir = 'path/to/output/directory'
|
output_dir = '/mnt/output'
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Ensure output directory exists
|
# Ensure output directory exists
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
|
||||||
def save_email_as_markdown(email, index):
|
def sanitize_filename(filename):
    """Return *filename* with characters that are invalid in file names removed.

    The characters ``<``, ``>``, ``:``, ``"``, ``/``, backslash, ``|``, ``?``,
    ``*`` and the ASCII control characters 0x00-0x1F are deleted; everything
    else is kept as is.

    Args:
        filename: The text to clean. Non-``str`` values are converted with
            ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Header lookups on legacy email messages can yield a non-str object
    # (e.g. email.header.Header); re.sub() would raise TypeError on it.
    return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '', str(filename))
|
||||||
|
|
||||||
|
def format_date(date_str):
    """Format an email ``Date`` header value for use in a filename.

    Args:
        date_str: The raw header value (RFC 2822 style date string).

    Returns:
        The date as ``YYYY-MM-DD_HH-MM-SS`` in the local time zone of the
        running process, or the literal ``'NoDate'`` when the value is empty,
        cannot be parsed, or cannot be converted to a timestamp.
    """
    if not date_str:
        return 'NoDate'

    try:
        parsed_date = parsedate_tz(date_str)
        # parsedate_tz() signals an unparsable date by returning None.
        if parsed_date is None:
            return 'NoDate'
        timestamp = mktime_tz(parsed_date)
        return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d_%H-%M-%S')
    except Exception as e:
        # Best effort: a bad date must never stop the conversion of an email.
        logger.error(f"Error formatting date: {e}")
        return 'NoDate'
|
||||||
|
|
||||||
|
# Upper bound (in UTF-8 bytes) for the descriptive part of a filename, leaving
# room for the " - <number>.md" suffix under the common 255-byte name limit.
_MAX_FILENAME_STEM_BYTES = 200


def _header_text(message, name, default):
    """Return header *name* of *message* as single-line text, or *default* if absent."""
    value = message.get(name)
    if value is None:
        return default
    # str() copes with non-str header objects; split/join collapses the line
    # breaks of folded headers so the value stays on one line.
    return ' '.join(str(value).split())


def _decode_part(part):
    """Decode one non-multipart MIME part to text; return None if it has no payload."""
    raw = part.get_payload(decode=True)
    if raw is None:
        return None
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='ignore')
    except LookupError:
        # Unknown charset label in the message: fall back to UTF-8.
        return raw.decode('utf-8', errors='ignore')


def _extract_body(message):
    """Return the message text, preferring a text/html part over text/plain.

    Returns None when no usable part is found.
    """
    if not message.is_multipart():
        return _decode_part(message)

    html_text = None
    plain_text = None
    for part in message.walk():
        # Containers carry no payload of their own; attachments are not body text.
        if part.is_multipart() or part.get_content_disposition() == 'attachment':
            continue
        content_type = part.get_content_type()
        if content_type == 'text/html' and html_text is None:
            html_text = _decode_part(part)
        elif content_type == 'text/plain' and plain_text is None:
            plain_text = _decode_part(part)
    return html_text if html_text is not None else plain_text


def save_email_as_markdown(email, index, output_subdir):
    """Write one email message to a Markdown file inside *output_subdir*.

    The file is named
    ``<subject> - <sender> - <first recipient> - <date> - <number>.md`` where
    ``<number>`` is ``index + 1``. The number keeps messages with identical
    headers from overwriting each other. The file contains the subject as a
    heading, the Date/From/To headers, and the body converted with
    ``markdownify``.

    Args:
        email: The message object (as yielded by iterating a mailbox).
        index: Zero-based position of the message in the mailbox.
        output_subdir: Existing directory the file is written to.

    Raises:
        OSError: If the file cannot be created or written.
    """
    subject = _header_text(email, 'subject', 'No Subject')
    date = _header_text(email, 'date', 'No Date')
    sender = _header_text(email, 'from', 'Unknown Sender')
    recipients = _header_text(email, 'to', 'Unknown Recipient')

    # Sanitize and format the filename
    sanitized_subject = sanitize_filename(subject)
    sanitized_sender = sanitize_filename(sender.split('<')[0].strip())
    sanitized_recipients = sanitize_filename(recipients.split(',')[0].strip())
    formatted_date = format_date(date)
    stem = f"{sanitized_subject} - {sanitized_sender} - {sanitized_recipients} - {formatted_date}"
    # Truncate on bytes (not characters) so multi-byte text also stays within
    # the limit; 'ignore' drops a character cut in half by the slice.
    stem = stem.encode('utf-8', errors='ignore')[:_MAX_FILENAME_STEM_BYTES]
    stem = stem.decode('utf-8', errors='ignore').rstrip()
    filename = os.path.join(output_subdir, f"{stem} - {index + 1}.md")

    # A multipart message has no payload of its own, so the body has to be
    # looked up among its parts.
    try:
        body = _extract_body(email)
        if body is None:
            body_markdown = "No content available"
        else:
            body_markdown = markdownify(body)
    except (UnicodeDecodeError, AttributeError) as e:
        body_markdown = f"Error decoding content: {e}"

    # Create a Markdown file for each email
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f'# {subject}\n')
        file.write(f'*Date: {date}*\n')
        file.write(f'*From: {sender}*\n')
        file.write(f'*To: {recipients}*\n\n')
        file.write(body_markdown)

    logger.info(f"Saved email {index + 1} as Markdown: {filename}")
|
||||||
|
|
||||||
def convert_mbox_to_markdown(mbox_file):
    """Convert every message of an .mbox file into its own Markdown file.

    The files are written to a subdirectory of ``output_dir`` named after the
    mailbox file without its extension. A message that fails to convert is
    logged and skipped so the remaining messages are still processed.

    Args:
        mbox_file: Path of the .mbox file to convert.
    """
    # Create a subdirectory in the output directory with the name of the .mbox file
    base_name = os.path.basename(mbox_file)
    subdir_name = os.path.splitext(base_name)[0]  # Remove the .mbox extension
    output_subdir = os.path.join(output_dir, subdir_name)
    os.makedirs(output_subdir, exist_ok=True)

    logger.info(f"Processing .mbox file: {mbox_file}")
    mbox = mailbox.mbox(mbox_file)
    try:
        total_emails = len(mbox)
        logger.info(f"Total emails to process: {total_emails}")

        failed = 0
        # Show progress bar
        for i, email in tqdm(enumerate(mbox), total=total_emails, desc='Converting emails'):
            try:
                save_email_as_markdown(email, i, output_subdir)
            except Exception:
                # Batch boundary: record the failure and keep going.
                failed += 1
                logger.exception(f"Failed to convert email {i + 1}; skipping it")

        if failed:
            logger.warning(f"{failed} of {total_emails} emails could not be converted")
    finally:
        # Release the file handle held by the mailbox object.
        mbox.close()
|
||||||
|
|
||||||
|
class MboxFileHandler(FileSystemEventHandler):
    """File-system event handler that converts newly created .mbox files."""

    def on_created(self, event):
        """Convert the created file if it is a regular file ending in ``.mbox``.

        Failures are logged rather than raised.
        """
        if event.is_directory:
            return
        if event.src_path.endswith('.mbox'):
            logger.info(f"New .mbox file detected: {event.src_path}")
            try:
                # The creation event can arrive while the file is still being
                # copied into the directory; wait until it stops growing.
                self._wait_until_stable(event.src_path)
                convert_mbox_to_markdown(event.src_path)
            except Exception:
                # NOTE(review): an uncaught exception here would escape into
                # watchdog's dispatching thread -- presumably ending event
                # delivery; confirm against the watchdog version in use.
                logger.exception(f"Failed to convert {event.src_path}")

    @staticmethod
    def _wait_until_stable(path, interval=1.0, checks=2):
        """Block until the size of *path* is unchanged for *checks* consecutive polls.

        Raises:
            OSError: If the file disappears or cannot be inspected.
        """
        import time  # local import: the module does not import time at the top

        last_size = -1
        stable_polls = 0
        while stable_polls < checks:
            size = os.path.getsize(path)
            if size == last_size:
                stable_polls += 1
            else:
                stable_polls = 0
                last_size = size
            time.sleep(interval)
|
||||||
|
|
||||||
|
def start_watching():
    """Watch ``input_dir`` for new .mbox files until interrupted.

    Starts a watchdog observer with a ``MboxFileHandler`` on ``input_dir``
    (non-recursive) and blocks. On ``KeyboardInterrupt`` (Ctrl+C) the observer
    is stopped and joined before the function returns.
    """
    import time  # local import: the module does not import time at the top

    event_handler = MboxFileHandler()
    observer = Observer()
    observer.schedule(event_handler, path=input_dir, recursive=False)
    observer.start()
    logger.info(f"Watching for new .mbox files in {input_dir}...")

    try:
        while True:
            # Sleep instead of spinning: an empty loop body would keep one
            # CPU core fully busy for as long as the script runs.
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
start_watching()
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1,4 @@
|
||||||
mailbox
|
mailbox
|
||||||
markdownify
|
markdownify
|
||||||
|
watchdog
|
||||||
|
tqdm
|
||||||
|
|
Loading…
Reference in New Issue