This commit is contained in:
friedemann.blume 2024-07-19 13:41:51 +02:00
parent 18f4b4e1e0
commit c3065028a3
1 changed file with 51 additions and 41 deletions

View File

@ -37,56 +37,66 @@ def format_date(date_str):
return 'NoDate' return 'NoDate'
def _decode_text_payload(part):
    """Return the decoded text of a single (non-multipart) message part.

    Returns None when the part has no decodable payload. The part's declared
    charset is used when present; UTF-8 is the fallback. Undecodable bytes are
    dropped (errors='ignore') so one bad byte does not lose the whole body.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return None
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='ignore')
    except LookupError:
        # The message declared a charset label Python does not know.
        return raw.decode('utf-8', errors='ignore')


def _extract_body_text(email):
    """Return the best text body of *email*, or None if there is none.

    For multipart messages get_payload(decode=True) always returns None, so
    the parts are walked instead: the first text/html part wins (the result is
    fed to an HTML-to-Markdown converter), otherwise the first other text/*
    part is used. Parts carrying a filename are treated as attachments and
    skipped.
    """
    if not email.is_multipart():
        return _decode_text_payload(email)

    fallback = None
    for part in email.walk():
        if part.is_multipart() or part.get_content_maintype() != 'text':
            continue
        if part.get_filename():
            continue
        text = _decode_text_payload(part)
        if text is None:
            continue
        if part.get_content_subtype() == 'html':
            return text
        if fallback is None:
            fallback = text
    return fallback


def save_email_as_markdown(email, index, output_subdir):
    """Write one email message to a Markdown file inside *output_subdir*.

    The file name is built from the sanitized subject, sender, first
    recipient and formatted date. The file holds a small header (subject,
    date, from, to) followed by the body converted with ``markdownify``.

    Args:
        email: Message object exposing the ``email.message.Message`` API
            (``get``, ``get_payload``, ``walk``, ...).
        index: Zero-based position of the message; used only for logging.
        output_subdir: Directory the Markdown file is written into.

    Returns:
        None. Any failure is logged (with traceback) and swallowed, so one
        bad message does not abort processing of the remaining ones.
    """
    logger.info(f"Starting to process email {index + 1}")
    try:
        # str(): headers with raw non-ASCII bytes may come back as
        # email.header.Header objects, which have no .split().
        subject = str(email.get('subject', 'No Subject'))
        date = str(email.get('date', 'No Date'))
        sender = str(email.get('from', 'Unknown Sender'))
        recipients = str(email.get('to', 'Unknown Recipient'))

        # Sanitize and format the filename
        sanitized_subject = sanitize_filename(subject)
        sanitized_sender = sanitize_filename(sender.split('<')[0].strip())
        sanitized_recipients = sanitize_filename(recipients.split(',')[0].strip())
        formatted_date = format_date(date)

        # NOTE(review): two emails that produce the same name overwrite each
        # other, since the file is opened with mode 'w' -- confirm this is
        # acceptable.
        filename = f"{sanitized_subject} - {sanitized_sender} - {sanitized_recipients} - {formatted_date}.md"
        filename = os.path.join(output_subdir, filename)

        # Handle messages without any usable text payload
        try:
            body = _extract_body_text(email)
            if body is None:
                body_markdown = "No content available"
            else:
                body_markdown = markdownify(body)
        except (UnicodeDecodeError, AttributeError) as e:
            body_markdown = f"Error decoding content: {e}"

        # Create a Markdown file for each email
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f'# {subject}\n')
            file.write(f'*Date: {date}*\n')
            file.write(f'*From: {sender}*\n')
            file.write(f'*To: {recipients}*\n\n')
            file.write(body_markdown)

        logger.info(f"Saved email {index + 1} as Markdown: {filename}")
    except Exception as e:
        # Best-effort boundary: keep going, but retain the traceback.
        logger.exception(f"Error processing email {index + 1}: {e}")
def convert_mbox_to_markdown(mbox_file):
    """Convert every message of an .mbox file into Markdown files.

    A subdirectory named after the mbox file (without its extension) is
    created under the module-level ``output_dir``; each message is handed to
    ``save_email_as_markdown`` together with its zero-based index.

    Args:
        mbox_file: Path of the .mbox file to convert.

    Returns:
        None. Any failure is logged (with traceback) and swallowed.
    """
    try:
        # Create a subdirectory in the output directory with the name of the .mbox file
        base_name = os.path.basename(mbox_file)
        subdir_name = os.path.splitext(base_name)[0]  # Remove the .mbox extension
        output_subdir = os.path.join(output_dir, subdir_name)
        os.makedirs(output_subdir, exist_ok=True)

        logger.info(f"Processing .mbox file: {mbox_file}")
        # create=False: a path that vanished must raise and be logged here,
        # not be silently re-created as an empty mailbox.
        mbox = mailbox.mbox(mbox_file, create=False)
        try:
            # Show progress bar
            total_emails = len(mbox)
            logger.info(f"Total emails to process: {total_emails}")

            for i, email in tqdm(enumerate(mbox), total=total_emails, desc='Converting emails'):
                logger.info(f"Processing email {i + 1}/{total_emails}")
                save_email_as_markdown(email, i, output_subdir)
        finally:
            # Release the file handle held by the mailbox object.
            mbox.close()

        logger.info(f"Completed processing {mbox_file}")
    except Exception as e:
        # Best-effort boundary: keep the caller alive, but retain the traceback.
        logger.exception(f"Error processing mbox file {mbox_file}: {e}")
class MboxFileHandler(FileSystemEventHandler): class MboxFileHandler(FileSystemEventHandler):
def on_created(self, event): def on_created(self, event):