177 lines
5.7 KiB
Python
177 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
seed_from_markdown.py
|
|
|
|
Reads a Markdown file, extracts all unchecked task items (- [ ] lines),
|
|
parses optional metadata from the same line or the preceding heading,
|
|
and inserts them as tasks into the SOMA database.
|
|
|
|
Usage:
|
|
python seed_from_markdown.py --file /path/to/life_plan.md [--dry-run]
|
|
|
|
Format recognized:
|
|
# Project Name
|
|
## Sub-section
|
|
- [ ] Task title @tag #priority due:2026-04-01
|
|
- [x] Already done task (skipped)
|
|
|
|
Metadata extraction:
|
|
@tag → maps to project_tag (e.g., @openclaw, @mak)
|
|
#priority → maps to priority (low|medium|high|critical)
|
|
due:YYYY-MM-DD → maps to due_date
|
|
Default tag: derived from nearest parent heading (slugified, max 50 chars)
|
|
Default priority: medium
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
import os
|
|
import asyncio
|
|
from datetime import date
|
|
|
|
# Add parent directory to path so we can import app modules
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
|
|
def parse_markdown_tasks(filepath: str) -> list[dict]:
    """Parse all unchecked ``- [ ]`` tasks from a markdown file.

    Walks the file line by line, tracking the nearest heading (``#`` to
    ``###``) to derive a default project tag, then extracts inline
    metadata from each unchecked task line:

    * ``@tag``            -> project_tag
    * ``#priority``       -> priority (low|medium|high|critical)
    * ``due:YYYY-MM-DD``  -> due_date (dropped if not a real date)

    Args:
        filepath: Path to the markdown file to read (UTF-8).

    Returns:
        A list of task dicts ready for DB insertion, each with keys:
        title, description, status, priority, project_tag, due_date,
        xp_reward, position, checklist. Checked (``[x]``) items and
        tasks whose title is empty after metadata removal are skipped.
    """
    # Compiled once up front — these patterns run on every line of the file.
    heading_re = re.compile(r'^#{1,3}\s+(.+)$')
    task_re = re.compile(r'^\s*[-*]\s+\[ \]\s+(.+)$')
    tag_re = re.compile(r'@(\S+)')
    # \b keeps e.g. '#highlights' from being misread as priority 'high'.
    priority_re = re.compile(r'#(low|medium|high|critical)\b', re.IGNORECASE)
    due_re = re.compile(r'due:(\d{4}-\d{2}-\d{2})')

    # Heading keywords that map to well-known project tags.
    tag_map = {
        'openclaw': 'openclaw', 'mak': 'mak', 'rarity': 'rarity-media',
        'japan': 'japan', 'germany': 'germany', 'personal': 'personal',
        'habit': 'habits', 'retreat': 'retreat'
    }
    # XP reward granted per priority level.
    xp_map = {'low': 25, 'medium': 50, 'high': 100, 'critical': 200}

    tasks: list[dict] = []
    current_tag = "general"
    current_section = ""

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        line = line.rstrip()

        # Track headings: they set the default tag and the section name
        # recorded in each task's description.
        heading_match = heading_re.match(line)
        if heading_match:
            heading_text = heading_match.group(1).strip()
            lowered = heading_text.lower()
            # Slugify heading: lowercase, collapse non-alphanumerics to '-'.
            slug = re.sub(r'[^a-z0-9]+', '-', lowered).strip('-')[:50]
            for keyword, tag in tag_map.items():
                if keyword in lowered:
                    current_tag = tag
                    break
            else:
                # No known keyword: fall back to the slugified heading.
                current_tag = slug if slug else "general"
            current_section = heading_text
            continue

        # Match unchecked task: "- [ ]" or "* [ ]". Checked items fall through.
        task_match = task_re.match(line)
        if not task_match:
            continue

        raw_title = task_match.group(1).strip()

        # Defaults, overridden by inline metadata below.
        tag = current_tag
        priority = "medium"
        due_date = None

        # @tag -> project_tag
        tag_match = tag_re.search(raw_title)
        if tag_match:
            tag = tag_match.group(1)
            raw_title = raw_title.replace(tag_match.group(0), '').strip()

        # #priority -> priority (case-insensitive, stored lowercase)
        priority_match = priority_re.search(raw_title)
        if priority_match:
            priority = priority_match.group(1).lower()
            raw_title = raw_title.replace(priority_match.group(0), '').strip()

        # due:YYYY-MM-DD -> due_date; invalid calendar dates are dropped
        # but the marker is still removed from the title either way.
        due_match = due_re.search(raw_title)
        if due_match:
            try:
                due_date = due_match.group(1)
                date.fromisoformat(due_date)  # validate
            except ValueError:
                due_date = None
            raw_title = raw_title.replace(due_match.group(0), '').strip()

        # Collapse leftover whitespace; skip tasks that were metadata-only.
        title = re.sub(r'\s+', ' ', raw_title).strip()
        if not title:
            continue

        tasks.append({
            'title': title,
            'description': f"Imported from: {current_section}" if current_section else None,
            'status': 'backlog',
            'priority': priority,
            'project_tag': tag,
            'due_date': due_date,
            'xp_reward': xp_map[priority],
            'position': 0,
            'checklist': [],
        })

    return tasks
|
|
|
|
|
|
async def insert_tasks(tasks: list[dict], dry_run: bool = False):
    """Insert parsed tasks into the database.

    In dry-run mode the tasks are only printed; the database layer is
    never imported, so ``--dry-run`` works without any DB configuration.

    Args:
        tasks: Task dicts as produced by ``parse_markdown_tasks``.
        dry_run: When True, print what would be inserted and return
            without touching the database.
    """
    if dry_run:
        print(f"DRY RUN: Would insert {len(tasks)} tasks:")
        for i, t in enumerate(tasks, 1):
            print(f" {i:3}. [{t['priority'].upper():8}] [{t['project_tag']:20}] {t['title']}")
        return

    # Imported lazily so a dry run does not require the app/DB stack.
    from app.database import AsyncSessionLocal
    from app.models.task import Task
    import uuid

    async with AsyncSessionLocal() as db:
        inserted = 0
        for i, task_data in enumerate(tasks):
            # Position reflects insertion order from the markdown file.
            task_data['position'] = i
            db.add(Task(id=uuid.uuid4(), **task_data))
            inserted += 1

        await db.commit()
        print(f"Inserted {inserted} tasks.")
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, read the markdown file, seed tasks.

    Exits with status 1 when the input file is missing and status 0 when
    the file contains no unchecked task lines.
    """
    parser = argparse.ArgumentParser(description='Seed SOMA tasks from a Markdown file')
    parser.add_argument('--file', required=True, help='Path to the Markdown file')
    parser.add_argument('--dry-run', action='store_true', help='Parse only, do not insert')
    args = parser.parse_args()

    source = args.file

    # Guard: bail out early if the file does not exist.
    if not os.path.exists(source):
        print(f"Error: File not found: {source}", file=sys.stderr)
        sys.exit(1)

    print(f"Parsing: {source}")
    parsed = parse_markdown_tasks(source)
    print(f"Found {len(parsed)} unchecked tasks")

    # Guard: nothing to insert means a clean no-op exit.
    if not parsed:
        print("No tasks found. Check that the file contains '- [ ] ' lines.")
        sys.exit(0)

    asyncio.run(insert_tasks(parsed, dry_run=args.dry_run))
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()