Skip to content

FASTA

Stdlib-only FASTA parsing — no BioPython required.

from seqchain.io.fasta import iter_fasta, load_chrom_sizes

for name, seq in iter_fasta("reference.fa"):
    print(f"{name}: {len(seq)} bp")

sizes = load_chrom_sizes("reference.fa")

iter_fasta

iter_fasta(path: str | Path) -> Iterator[tuple[str, str]]

Iterate over records in a FASTA file, yielding (name, sequence).

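Stdlib-only FASTA parser — no BioPython required. Supports plain and gzip-compressed (.gz) files. Sequences are returned as uppercase strings. Because this is a generator, a missing file raises FileNotFoundError when iteration begins, not when iter_fasta is called.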

Parameters:

Name Type Description Default
path str | Path

Path to a FASTA file.

required

Yields:

Type Description
tuple[str, str]

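(name, sequence) tuples, where name is the first whitespace-delimited token of the header line (without the leading ">") and sequence is the record's sequence lines joined together and uppercased.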

Examples:

>>> for name, seq in iter_fasta("ref.fna"):
...     print(name, len(seq))
Source code in src/seqchain/io/fasta.py
def iter_fasta(path: str | Path) -> Iterator[tuple[str, str]]:
    """Iterate over records in a FASTA file, yielding (name, sequence).

    Stdlib-only FASTA parser — no BioPython required. Supports plain
    and gzip-compressed (``.gz``) files. Sequences are returned as
    uppercase strings.

    Args:
        path: Path to a FASTA file.

    Yields:
        ``(name, sequence)`` tuples.

    Examples:
        >>> for name, seq in iter_fasta("ref.fna"):
        ...     print(name, len(seq))
    """
    path = Path(path)
    opener = gzip.open if path.name.endswith(".gz") else open

    try:
        with opener(path, "rt") as f:
            name: str | None = None
            parts: list[str] = []
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    if name is not None:
                        yield name, "".join(parts).upper()
                    name = line[1:].split()[0]
                    parts = []
                elif line:
                    parts.append(line)
            if name is not None:
                yield name, "".join(parts).upper()
    except FileNotFoundError:
        raise FileNotFoundError(f"FASTA file not found: {path}") from None

load_chrom_sizes

load_chrom_sizes(path: str | Path) -> dict[str, int]

Load chromosome sizes from a FASTA file.

Parameters:

Name Type Description Default
path str | Path

Path to a FASTA file.

required

Returns:

Type Description
dict[str, int]

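Dict of {name: length}, mapping each record name to the length of its sequence in characters. If a name occurs more than once, the last record's length is kept.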

Raises:

Type Description
FileNotFoundError

If the file does not exist.

Examples:

>>> sizes = load_chrom_sizes("ref.fna")
>>> sizes["chr1"]
4411532
Source code in src/seqchain/io/fasta.py
def load_chrom_sizes(path: str | Path) -> dict[str, int]:
    """Build a mapping of record name to sequence length for a FASTA file.

    Every record produced by ``iter_fasta`` is consumed; if a name
    occurs more than once, the length of the last record is kept.

    Args:
        path: Path to a FASTA file.

    Returns:
        Dict of ``{name: length}``.

    Raises:
        FileNotFoundError: If the file does not exist.

    Examples:
        >>> sizes = load_chrom_sizes("ref.fna")
        >>> sizes["chr1"]
        4411532
    """
    lengths: dict[str, int] = {}
    for record_name, sequence in iter_fasta(path):
        lengths[record_name] = len(sequence)
    return lengths