#
# Copyright 2025 translate-toolkit contributors
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
Module for parsing AsciiDoc files for translation.

The principles for extraction of translation units are as follows:

1. Extract all content relevant for translation, at the cost of also
   including some formatting.
2. One translation unit per paragraph.
3. Keep formatting out of the translation units as much as possible.
   Use placeholders {1}, {2}, ..., as needed for complex inline elements.
4. Support common AsciiDoc elements: headings, paragraphs, lists, etc.

White space within translation units is normalized.
"""

from __future__ import annotations

import re
from typing import Any

from translate.storage import base


class AsciiDocUnit(base.TranslationUnit):
    """
    One translatable piece of an AsciiDoc document.

    Besides the source text handled by the base class, a unit remembers the
    locations it was found at and the markup (element type, prefix, suffix)
    that surrounded it in the original file.
    """

    def __init__(self, source: str | None = None) -> None:
        super().__init__(source)
        # Defaults describe a bare paragraph with no surrounding markup.
        self._element_type, self._prefix, self._suffix = "paragraph", "", ""
        self.locations: list[str] = []

    def addlocation(self, location: str) -> None:
        """Record one more location string for this unit."""
        self.locations.append(location)

    def getlocations(self) -> list[str]:
        """Return the list of locations recorded so far."""
        return self.locations

    def set_element_info(
        self, element_type: str, prefix: str = "", suffix: str = ""
    ) -> None:
        """Remember the element type and the markup placed around the text."""
        self._element_type, self._prefix, self._suffix = (
            element_type,
            prefix,
            suffix,
        )


class AsciiDocHeaderUnit(AsciiDocUnit):
    """An AsciiDoc unit that reports itself as a header via ``isheader()``."""

    def isheader(self) -> bool:
        """Return True to indicate this unit represents document header metadata."""
        return True


class AsciiDocFile(base.TranslationStore):
    UnitClass = AsciiDocUnit

    def __init__(self, inputfile=None, callback=None) -> None:
        """
        Construct a new object instance.

        :param inputfile: if specified, the content of this file is read and parsed.
        :param callback: a function which takes a chunk of untranslated content as
          input and returns the corresponding translated content. Defaults to
          a no-op.
        """
        base.TranslationStore.__init__(self)
        self.filename: str | None = getattr(inputfile, "name", None)
        self.callback = callback or self._dummy_callback
        self.filesrc: str = ""
        self._elements: list[
            dict[str, Any]
        ] = []  # Store parsed elements for reconstruction
        # Docpath tracking: hierarchical heading structure
        self._heading_stack: list[tuple[int, int]] = []  # (level, index) pairs
        self._section_counts: list[dict[str, int]] = [
            {}
        ]  # Stack of element counts per section
        if inputfile is not None:
            adoc_src = inputfile.read()
            inputfile.close()
            self.parse(adoc_src)

    def parse(self, data: bytes) -> None:
        """Process the given source string (binary)."""
        text = data.decode()
        lines = text.splitlines(keepends=True)

        # Parse document header if present
        lines = self._parse_header(lines)

        # Parse the rest of the document
        self._parse_content(lines)

        # Reconstruct the document
        self._reconstruct()

    def _parse_header(self, lines: list[str]) -> list[str]:
        """
        Parse document header if present.

        A header is only recognised when the very first line starts with
        ``"= "``. The header then extends over attribute lines (starting
        with ``:``), comment blocks, up to three early author/revision-like
        lines and separating blank lines. The collected text is stored as a
        single :class:`AsciiDocHeaderUnit` and as a ``"header"`` element.

        :param lines: all lines of the document, with line endings kept.
        :return: the remaining lines after the header, or ``lines`` unchanged
          when no header is present.
        """
        # Check for document header (first line starting with = )
        if not lines or not lines[0].startswith("= "):
            return lines

        # The header includes: title line, optional author/revision lines, and attributes
        # Keep going until we hit a blank line followed by section/paragraph content
        header_end = 0  # At minimum, include the title
        seen_blank = False
        i = 1

        while i < len(lines):
            line = lines[i]

            # Check for comment block delimiter ////
            # Comment blocks can appear in the header and should be included
            if self._is_comment_block_delimiter(line):
                # The helper returns the index to resume at, so skip the
                # increment at the bottom of the loop.
                header_end, i = self._skip_comment_block(lines, i, header_end)
                continue

            # Attributes start with : - always part of header
            if line.startswith(":"):
                header_end = i
                seen_blank = False  # Reset blank line tracking
            # Empty line - might separate header sections or end header
            elif not line.strip():
                if seen_blank:
                    # Two blank lines in a row typically ends header
                    break
                seen_blank = True
                # Check if next line starts content
                if i + 1 < len(lines):
                    next_line = lines[i + 1]
                    # If next line is a section (==) or regular paragraph, end here
                    if next_line.strip() and (
                        next_line.startswith("==")
                        or (
                            not next_line.startswith(":")
                            and not self._is_comment_block_delimiter(next_line)
                        )
                    ):
                        # The blank line itself is kept as the last header line.
                        header_end = i
                        break
                header_end = i
            # Non-empty, non-attribute line after title
            # Could be author, revision info, or start of content
            else:
                # If we've seen a blank line and this doesn't start with :
                # it's likely the start of content
                if seen_blank:
                    break
                # Check if this looks like author/revision line (typically line 1-3 after title)
                # Author lines don't start with [ or = and are early in the document
                if i <= 3 and not line.startswith("[") and not line.startswith("="):
                    header_end = i
                else:
                    # This is content, not header
                    break
            i += 1

        # header_end starts at 0 and is only ever assigned line indices, so
        # this condition always holds here; the trailing ``return lines`` is
        # not reached once a title line was found.
        if header_end >= 0:
            header_content = "".join(lines[: header_end + 1])
            header = AsciiDocHeaderUnit(header_content)
            self.addunit(header)
            self._elements.append(
                {"type": "header", "content": header_content, "unit": header}
            )
            return lines[header_end + 1 :]

        return lines

    def _is_comment_block_delimiter(self, line: str) -> bool:
        """Check if a line is a comment block delimiter (////)."""
        return line.strip() == "////"

    def _skip_comment_block(
        self, lines: list[str], i: int, header_end: int
    ) -> tuple[int, int]:
        """Skip a comment block and return updated header_end and line index."""
        # Include the comment block in the header
        header_end = i
        i += 1
        # Skip until closing delimiter
        while i < len(lines) and not self._is_comment_block_delimiter(lines[i]):
            header_end = i
            i += 1
        if i < len(lines):
            header_end = i  # Include closing delimiter
            i += 1
        return header_end, i

    def _parse_content(self, lines: list[str]) -> None:
        """Parse AsciiDoc content and extract translation units."""
        i = 0
        while i < len(lines):
            line = lines[i]

            # Skip empty lines
            if not line.strip():
                self._elements.append({"type": "empty", "content": line})
                i += 1
                continue

            # Try each parsing method in order
            if self._try_parse_conditional(lines, i):
                i = self._get_next_index(lines, i, "conditional")
            elif (
                self._try_parse_directive(line, i)
                or self._try_parse_anchor(line, i)
                or self._try_parse_block_title(line, i)
                or self._try_parse_attribute(line, i)
                or self._try_parse_heading(line, i)
                or self._try_parse_unordered_list(line, i)
                or self._try_parse_ordered_list(line, i)
                or self._try_parse_description_list(line, i)
            ):
                i += 1
            elif self._try_parse_block_delimiter(lines, i):
                i = self._get_next_index(lines, i, "block")
            elif (
                self._try_parse_list_continuation(line, i)
                or self._try_parse_comment(line, i)
                or self._try_parse_admonition(line, i)
            ):
                i += 1
            elif self._try_parse_table(lines, i):
                i = self._get_next_index(lines, i, "table")
            else:
                # Parse as paragraph
                i = self._parse_paragraph(lines, i)

    def _try_parse_conditional(self, lines: list[str], i: int) -> bool:
        """Parse conditional directives (ifdef, ifndef, ifeval)."""
        line = lines[i]
        if not re.match(r"^(ifdef|ifndef|ifeval)::", line):
            return False

        # Store the opening directive
        block_lines = [line]
        i += 1
        # Collect everything until endif
        depth = 1
        while i < len(lines) and depth > 0:
            current_line = lines[i]
            block_lines.append(current_line)
            if re.match(r"^(ifdef|ifndef|ifeval)::", current_line):
                depth += 1
            elif re.match(r"^endif::", current_line):
                depth -= 1
                if depth == 0:
                    break
            i += 1
        self._elements.append(
            {
                "type": "conditional_block",
                "content": "".join(block_lines),
                "end_index": i + 1,
            }
        )
        return True

    def _try_parse_directive(self, line: str, i: int) -> bool:
        """Parse standalone endif directive."""
        if not re.match(r"^endif::", line):
            return False
        self._elements.append({"type": "directive", "content": line})
        return True

    def _try_parse_anchor(self, line: str, i: int) -> bool:
        """Parse anchor [[anchor-id]]."""
        # Match anchor syntax: [[id]] where id contains non-bracket characters
        # Pattern [^\]]+ efficiently matches typical anchor IDs (short identifiers)
        # without complex backtracking issues
        if not re.match(r"^\[\[[^\]]+\]\]\s*$", line):
            return False
        self._elements.append({"type": "anchor", "content": line})
        return True

    def _try_parse_block_title(self, line: str, i: int) -> bool:
        """Parse block title (starts with . followed by alphanumeric)."""
        if not re.match(r"^\.[A-Za-z0-9]", line):
            return False
        self._elements.append({"type": "block_title", "content": line})
        return True

    def _try_parse_attribute(self, line: str, i: int) -> bool:
        """Parse attribute lines (e.g., [NOTE], [source,java])."""
        if not (line.strip().startswith("[") and line.strip().endswith("]")):
            return False
        self._elements.append({"type": "attribute", "content": line})
        return True

    def _try_parse_heading(self, line: str, i: int) -> bool:
        """Parse section heading."""
        heading_match = re.match(r"^(={2,6})\s+(\S.*?)$", line)
        if not heading_match:
            return False

        level = len(heading_match.group(1))
        # Strip any trailing whitespace and closing = markers
        title = heading_match.group(2).rstrip().rstrip("=").strip()

        # Update heading hierarchy and get docpath
        docpath = self._enter_heading(level)

        unit = self.addsourceunit(title)
        unit.addlocation(f"{self.filename or ''}:{i + 1}")
        unit.setdocpath(docpath)
        unit.set_element_info(
            "heading",
            heading_match.group(1) + " ",
            "\n" if line.endswith("\n") else "",
        )

        self._elements.append(
            {
                "type": "heading",
                "level": level,
                "prefix": heading_match.group(1) + " ",
                "suffix": "\n" if line.endswith("\n") else "",
                "unit": unit,
                "line": i + 1,
            }
        )
        return True

    def _try_parse_unordered_list(self, line: str, i: int) -> bool:
        """Parse unordered list item."""
        list_match = re.match(r"^(\*+)\s+(\S.*?)$", line)
        if not list_match:
            return False

        level = len(list_match.group(1))
        content = list_match.group(2).strip()

        # Handle checklist syntax [*], [x], [ ]
        checklist_prefix = ""
        # Use .*? for non-greedy matching to avoid backtracking
        checklist_match = re.match(r"^(\[[*x ]\])\s+(\S.*?)$", content)
        if checklist_match:
            checklist_prefix = checklist_match.group(1) + " "
            content = checklist_match.group(2).strip()

        docpath = self._build_docpath("li")
        unit = self.addsourceunit(content)
        unit.addlocation(f"{self.filename or ''}:{i + 1}")
        unit.setdocpath(docpath)
        prefix = list_match.group(1) + " " + checklist_prefix
        unit.set_element_info(
            "list_item",
            prefix,
            "\n" if line.endswith("\n") else "",
        )

        self._elements.append(
            {
                "type": "list_item",
                "level": level,
                "prefix": prefix,
                "suffix": "\n" if line.endswith("\n") else "",
                "unit": unit,
                "line": i + 1,
            }
        )
        return True

    def _try_parse_ordered_list(self, line: str, i: int) -> bool:
        """Parse ordered list item."""
        ordered_list_match = re.match(r"^(\.+)\s+(\S.*?)$", line)
        if not ordered_list_match:
            return False

        level = len(ordered_list_match.group(1))
        content = ordered_list_match.group(2).strip()

        docpath = self._build_docpath("li")
        unit = self.addsourceunit(content)
        unit.addlocation(f"{self.filename or ''}:{i + 1}")
        unit.setdocpath(docpath)
        unit.set_element_info(
            "list_item",
            ordered_list_match.group(1) + " ",
            "\n" if line.endswith("\n") else "",
        )

        self._elements.append(
            {
                "type": "list_item",
                "level": level,
                "prefix": ordered_list_match.group(1) + " ",
                "suffix": "\n" if line.endswith("\n") else "",
                "unit": unit,
                "line": i + 1,
            }
        )
        return True

    def _try_parse_description_list(self, line: str, i: int) -> bool:
        """Parse description list (term:: definition)."""
        # Use [^:\n]* to match non-colon/non-newline characters for the term.
        # This prevents the term capture from consuming the :: delimiter when
        # the input contains multiple colons before the actual delimiter.
        desc_list_match = re.match(r"^(\S[^:\n]*)::\s+(\S.*?)$", line)
        if not desc_list_match:
            return False

        term = desc_list_match.group(1).strip()
        definition = desc_list_match.group(2).strip()

        # Create unit for the definition only (term is part of the markup)
        docpath = self._build_docpath("dl")
        unit = self.addsourceunit(definition)
        unit.addlocation(f"{self.filename or ''}:{i + 1}")
        unit.setdocpath(docpath)
        unit.set_element_info(
            "description_list",
            f"{term}:: ",
            "\n" if line.endswith("\n") else "",
        )

        self._elements.append(
            {
                "type": "description_list",
                "term": term,
                "prefix": f"{term}:: ",
                "suffix": "\n" if line.endswith("\n") else "",
                "unit": unit,
                "line": i + 1,
            }
        )
        return True

    def _try_parse_block_delimiter(self, lines: list[str], i: int) -> bool:
        """Parse code blocks, literal blocks, example blocks, etc."""
        line = lines[i]
        if not (
            line.strip()
            and len(set(line.strip())) == 1
            and line.strip()[0] in "-=.*_+/"
        ):
            return False

        # Check if it's a delimiter (4+ repeated characters)
        delimiter = line.strip()
        if len(delimiter) < 4:
            return False

        block_lines = [line]
        i += 1
        while i < len(lines):
            block_lines.append(lines[i])
            if lines[i].strip() == delimiter:
                i += 1
                break
            i += 1

        self._elements.append(
            {"type": "code_block", "content": "".join(block_lines), "end_index": i}
        )
        return True

    def _try_parse_list_continuation(self, line: str, i: int) -> bool:
        """Parse list continuation marker (standalone +)."""
        if line.strip() != "+":
            return False
        self._elements.append({"type": "list_continuation", "content": line})
        return True

    def _try_parse_comment(self, line: str, i: int) -> bool:
        """Parse comment line."""
        if not line.startswith("//"):
            return False
        self._elements.append({"type": "comment", "content": line})
        return True

    def _try_parse_admonition(self, line: str, i: int) -> bool:
        """Parse admonition (NOTE, TIP, IMPORTANT, WARNING, CAUTION)."""
        admonition_match = re.match(
            r"^(NOTE|TIP|IMPORTANT|WARNING|CAUTION):\s+(\S.*?)$", line
        )
        if not admonition_match:
            return False

        admon_type = admonition_match.group(1)
        content = admonition_match.group(2).strip()

        docpath = self._build_docpath("admonition")
        unit = self.addsourceunit(content)
        unit.addlocation(f"{self.filename or ''}:{i + 1}")
        unit.setdocpath(docpath)
        unit.set_element_info(
            "admonition",
            f"{admon_type}: ",
            "\n" if line.endswith("\n") else "",
        )

        self._elements.append(
            {
                "type": "admonition",
                "admon_type": admon_type,
                "prefix": f"{admon_type}: ",
                "suffix": "\n" if line.endswith("\n") else "",
                "unit": unit,
                "line": i + 1,
            }
        )
        return True

    def _try_parse_table(self, lines: list[str], i: int) -> bool:
        """Parse table."""
        line = lines[i]
        if not line.strip().startswith("|"):
            return False

        table_lines = []
        start_line = i
        # Collect all table lines
        while i < len(lines) and (
            lines[i].strip().startswith("|") or not lines[i].strip()
        ):
            table_lines.append(lines[i])
            i += 1
            if i < len(lines) and not lines[i].strip():
                # Check if next non-empty line is still part of table
                next_i = i
                while next_i < len(lines) and not lines[next_i].strip():
                    next_i += 1
                if next_i >= len(lines) or not lines[next_i].strip().startswith("|"):
                    break

        # Parse table cells for translation
        # Note: This is a simple pipe-based split that doesn't handle:
        # - Escaped pipes (e.g., \|)
        # - CSV/PSV/DSV table formats (comma/colon/delimiter-separated values)
        # - Cell spanning (colspan/rowspan) with +/. notation
        # - Formatted table cells (a, e, h, l, m, s, v cell specifiers)
        # - Complex table features (cols/rows attributes, cell alignment)
        # For more sophisticated table parsing, consider using an AsciiDoc parser library.

        # Build table docpath
        table_docpath = self._build_docpath("table")
        row_index = 0

        for table_line in table_lines:
            if table_line.strip() and "|" in table_line:
                row_index += 1
                # Extract cells (simple approach)
                cells = [cell.strip() for cell in table_line.split("|") if cell.strip()]
                for col_index, cell in enumerate(cells, 1):
                    # Skip cell separator markers and empty cells
                    if cell and not cell.startswith("="):
                        cell_docpath = f"{table_docpath}/r[{row_index}]/c[{col_index}]"
                        unit = self.addsourceunit(cell)
                        unit.addlocation(f"{self.filename or ''}:{start_line + 1}")
                        unit.setdocpath(cell_docpath)
                        unit.set_element_info("table_cell", "", "")

        self._elements.append(
            {
                "type": "table",
                "content": "".join(table_lines),
                "line": start_line + 1,
                "end_index": i,
            }
        )
        return True

    def _parse_paragraph(self, lines: list[str], i: int) -> int:
        """Parse paragraph - collect consecutive non-empty lines."""
        para_lines = []
        start_line = i
        while i < len(lines) and lines[i].strip():
            # Check if this is a special line that breaks paragraphs
            line_stripped = lines[i].strip()
            # Check for block delimiters (4+ repeated characters)
            # Include '/' for comment blocks (////) to maintain consistency with
            # _try_parse_block_delimiter. Comment delimiters should properly break
            # paragraph parsing just like other block delimiters.
            is_delimiter = (
                len(line_stripped) >= 4
                and len(set(line_stripped)) == 1
                and line_stripped[0] in "-=.*_+/"
            )
            if (
                re.match(r"^(={2,6}|\*+|\.+)\s+", lines[i])
                or is_delimiter
                or lines[i].startswith("//")
            ):
                break
            para_lines.append(lines[i])
            i += 1

        if para_lines:
            # Join paragraph lines and normalize whitespace
            para_text = " ".join(line.strip() for line in para_lines)

            if para_text:
                docpath = self._build_docpath("p")
                unit = self.addsourceunit(para_text)
                unit.addlocation(f"{self.filename or ''}:{start_line + 1}")
                unit.setdocpath(docpath)
                unit.set_element_info("paragraph", "", "\n")

                self._elements.append(
                    {
                        "type": "paragraph",
                        "unit": unit,
                        "line": start_line + 1,
                        "original_lines": para_lines,
                    }
                )
        return i

    def _get_next_index(
        self, lines: list[str], current_i: int, element_type: str
    ) -> int:
        """Get the next index after parsing a multi-line element."""
        # Find the last element added and get its end_index
        if self._elements and "end_index" in self._elements[-1]:
            return self._elements[-1]["end_index"]
        return current_i + 1

    def _reconstruct(self) -> None:
        """Reconstruct the AsciiDoc document with translations."""
        result = []

        for element in self._elements:
            elem_type = element["type"]

            if elem_type == "header":
                if element.get("unit") and element["unit"].isheader():
                    result.append(element["content"])
            elif elem_type in {
                "empty",
                "code_block",
                "comment",
                "attribute",
                "list_continuation",
                "directive",
                "anchor",
                "block_title",
                "conditional_block",
            }:
                result.append(element["content"])
            elif elem_type in {
                "heading",
                "list_item",
                "admonition",
                "description_list",
            }:
                unit = element.get("unit")
                if unit:
                    translated = self.callback(unit.source)
                    result.append(f"{element['prefix']}{translated}{element['suffix']}")
            elif elem_type == "table":
                # For tables, we need to reconstruct with translated cells
                # For now, preserve the original table structure
                # (proper table translation would require more complex parsing)
                result.append(element["content"])
            elif elem_type == "paragraph":
                unit = element.get("unit")
                if unit:
                    translated = self.callback(unit.source)
                    # Try to preserve line structure somewhat
                    result.append(f"{translated}\n")

        self.filesrc = "".join(result)

    @staticmethod
    def _dummy_callback(text: str) -> str:
        """
        Default callback that returns text unchanged.

        Used when the store is constructed without a ``callback``, so the
        reconstructed document carries the source text of every unit.
        """
        return text

    def _build_docpath(self, element_type: str) -> str:
        """
        Build the current docpath string for a block element.

        Docpath format follows the heading hierarchy with element counters:
        h2[1]/h3[1]/p[2] - second paragraph under first h3 under first h2
        h2[1]/table[1]/r[1]/c[2] - second column of first row of first table under first h2
        """
        parts = [f"h{level}[{idx}]" for level, idx in self._heading_stack]
        counts = self._section_counts[-1]
        counts[element_type] = counts.get(element_type, 0) + 1
        parts.append(f"{element_type}[{counts[element_type]}]")
        return "/".join(parts)

    def _enter_heading(self, level: int) -> str:
        """
        Update heading stack when entering a heading and return its docpath.

        Pops any headings of equal or greater level, counts this heading at the
        current section level, and pushes a new section level.
        """
        # Pop headings of equal or greater level
        while self._heading_stack and self._heading_stack[-1][0] >= level:
            self._heading_stack.pop()
            self._section_counts.pop()
        # Count this heading at the current section level
        counts = self._section_counts[-1]
        heading_key = f"h{level}"
        counts[heading_key] = counts.get(heading_key, 0) + 1
        idx = counts[heading_key]
        # Push new section level
        self._heading_stack.append((level, idx))
        self._section_counts.append({})
        # Build and return path
        return "/".join(f"h{lvl}[{i}]" for lvl, i in self._heading_stack)
