FASTA

Stdlib-only FASTA parsing — no BioPython required.

# Import the two public helpers from the FASTA I/O module.
from seqchain.io.fasta import iter_fasta, load_chrom_sizes

# Stream records one at a time; each item is a (name, uppercase sequence) pair.
for name, seq in iter_fasta("reference.fa"):
    print(f"{name}: {len(seq)} bp")

# Build a {record name: sequence length} dict covering the whole file.
sizes = load_chrom_sizes("reference.fa")

iter_fasta

iter_fasta(path: str | Path) -> Iterator[tuple[str, str]]

Iterate over records in a FASTA file, yielding (name, sequence).

Stdlib-only FASTA parser — no BioPython required. Supports plain and gzip-compressed (.gz) files. Sequences are returned as uppercase strings.

Parameters:

Name	Type	Description	Default
`path`	`str \| Path`	Path to a FASTA file.	required

Yields:

Type	Description
`tuple[str, str]`	`(name, sequence)` tuples.

Examples:

>>> for name, seq in iter_fasta("ref.fna"):
...     print(name, len(seq))

Source code in src/seqchain/io/fasta.py

def iter_fasta(path: str | Path) -> Iterator[tuple[str, str]]:
    """Iterate over records in a FASTA file, yielding (name, sequence).

    Stdlib-only FASTA parser — no BioPython required. Supports plain
    and gzip-compressed (``.gz``) files. Sequences are returned as
    uppercase strings.

    Args:
        path: Path to a FASTA file.

    Yields:
        ``(name, sequence)`` tuples.

    Examples:
        >>> for name, seq in iter_fasta("ref.fna"):
        ...     print(name, len(seq))
    """
    path = Path(path)
    opener = gzip.open if path.name.endswith(".gz") else open

    try:
        with opener(path, "rt") as f:
            name: str | None = None
            parts: list[str] = []
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    if name is not None:
                        yield name, "".join(parts).upper()
                    name = line[1:].split()[0]
                    parts = []
                elif line:
                    parts.append(line)
            if name is not None:
                yield name, "".join(parts).upper()
    except FileNotFoundError:
        raise FileNotFoundError(f"FASTA file not found: {path}") from None

load_chrom_sizes

load_chrom_sizes(path: str | Path) -> dict[str, int]

Load chromosome sizes from a FASTA file.

Parameters:

Name	Type	Description	Default
`path`	`str \| Path`	Path to a FASTA file.	required

Returns:

Type	Description
`dict[str, int]`	Dict of `{name: length}`.

Raises:

Type	Description
`FileNotFoundError`	If the file does not exist.

Examples:

>>> sizes = load_chrom_sizes("ref.fna")
>>> sizes["chr1"]
4411532

Source code in src/seqchain/io/fasta.py

def load_chrom_sizes(path: str | Path) -> dict[str, int]:
    """Map every record name in a FASTA file to its sequence length.

    Records are read through ``iter_fasta``; if a name occurs more than
    once, the length of the last occurrence is kept.

    Args:
        path: Path to a FASTA file.

    Returns:
        Dict of ``{name: length}``.

    Raises:
        FileNotFoundError: If the file does not exist.

    Examples:
        >>> sizes = load_chrom_sizes("ref.fna")
        >>> sizes["chr1"]
        4411532
    """
    sizes: dict[str, int] = {}
    for record_name, sequence in iter_fasta(path):
        sizes[record_name] = len(sequence)
    return sizes