Skip to content

Intervals

Max-end segment tree for O(log n + k) interval overlap queries. IntervalIndex is the data structure behind IntervalTrack.regions().

Build an index from Regions, then query it with chromosome + coordinate range to get all overlapping entries.

intervals

Fast interval overlap lookups and set operations.

IntervalIndex provides O(log n + k) overlap queries backed by a max-end segment tree. intersect_intervals() computes pairwise intersections of two region sets — the fundamental spatial join primitive.

No external dependencies — uses bisect from stdlib.

IntervalIndex

IntervalIndex(data: dict[str, _ChromIndex])

Index for fast overlap, nearest, and proximity queries on Regions.

Backed by a per-chromosome segment tree over coordinate-sorted (start, end, region) tuples. Construction is O(n log n) via sorting; overlap queries are O(log n + k) where k is the number of results — bisect prunes the right boundary and the max-end tree prunes the left.

Examples:

>>> idx = IntervalIndex.build([
...     Region("chr1", 100, 200),
...     Region("chr1", 150, 250),
... ])
>>> len(idx.overlapping("chr1", 120, 160))
2
Source code in src/seqchain/primitives/intervals.py
def __init__(self, data: dict[str, _ChromIndex]) -> None:
    self._data = data

build staticmethod

build(regions: Sequence[Region]) -> IntervalIndex

Construct an index from a sequence of Regions.

Regions are grouped by chromosome and sorted by start position (ties broken by end position).

Parameters:

Name Type Description Default
regions Sequence[Region]

Iterable of Region objects.

required

Returns:

Type Description
IntervalIndex

A new IntervalIndex ready for queries.

Examples:

>>> idx = IntervalIndex.build([Region("chr1", 10, 20)])
>>> idx.overlapping("chr1", 15, 25)
[Region(chrom='chr1', start=10, end=20, strand='.', score=0.0, name='', tags={})]
Source code in src/seqchain/primitives/intervals.py
@staticmethod
def build(regions: Sequence[Region]) -> IntervalIndex:
    """Construct an index from a sequence of Regions.

    Regions are grouped by chromosome and sorted by start position
    (ties broken by end position).

    Args:
        regions: Iterable of `Region` objects.

    Returns:
        A new `IntervalIndex` ready for queries.

    Examples:
        >>> idx = IntervalIndex.build([Region("chr1", 10, 20)])
        >>> idx.overlapping("chr1", 15, 25)
        [Region(chrom='chr1', start=10, end=20, strand='.', score=0.0, name='', tags={})]
    """
    by_chrom: dict[str, list[tuple[int, int, Region]]] = {}
    for r in regions:
        by_chrom.setdefault(r.chrom, []).append((r.start, r.end, r))

    data: dict[str, _ChromIndex] = {}
    for chrom, entries in by_chrom.items():
        entries.sort(key=lambda t: (t[0], t[1]))
        data[chrom] = _ChromIndex(entries)

    return IntervalIndex(data)

overlapping

overlapping(chrom: str, start: int, end: int) -> list[Region]

Return all regions that overlap the query interval.

A region overlaps if region.start < end and region.end > start (half-open interval intersection). Complexity is O(log n + k) where k is the result count.

Parameters:

Name Type Description Default
chrom str

Chromosome name.

required
start int

Query interval start (inclusive).

required
end int

Query interval end (exclusive).

required

Returns:

Type Description
list[Region]

List of overlapping Region objects,

list[Region]

ordered by start position.

Examples:

>>> idx = IntervalIndex.build([Region("chr1", 10, 20)])
>>> idx.overlapping("chr1", 15, 25)
[Region(chrom='chr1', start=10, end=20, strand='.', score=0.0, name='', tags={})]
Source code in src/seqchain/primitives/intervals.py
def overlapping(self, chrom: str, start: int, end: int) -> list[Region]:
    """Return all regions that overlap the query interval.

    A region overlaps if ``region.start < end`` and
    ``region.end > start`` (half-open interval intersection).
    Complexity is O(log n + k) where *k* is the result count.

    Args:
        chrom: Chromosome name.
        start: Query interval start (inclusive).
        end: Query interval end (exclusive).

    Returns:
        List of overlapping `Region` objects,
        ordered by start position.

    Examples:
        >>> idx = IntervalIndex.build([Region("chr1", 10, 20)])
        >>> idx.overlapping("chr1", 15, 25)
        [Region(chrom='chr1', start=10, end=20, strand='.', score=0.0, name='', tags={})]
    """
    ci = self._data.get(chrom)
    if ci is None or start >= end:
        return []
    return ci.overlap(start, end)

nearest

nearest(chrom: str, pos: int, max_distance: int | None = None) -> Region | None

Return the single nearest region to a position.

Distance is measured from pos to the closest edge of each region (0 if pos is inside the region).

Parameters:

Name Type Description Default
chrom str

Chromosome name.

required
pos int

Query position.

required
max_distance int | None

If given, only consider regions within this many bp. None (default) means no limit.

None

Returns:

Type Description
Region | None

The nearest Region, or None if

Region | None

the chromosome has no regions (or none within

Region | None

max_distance).

Examples:

>>> idx = IntervalIndex.build([Region("chr1", 100, 200)])
>>> idx.nearest("chr1", 50)
Region(chrom='chr1', start=100, end=200, strand='.', score=0.0, name='', tags={})
Source code in src/seqchain/primitives/intervals.py
def nearest(
    self, chrom: str, pos: int, max_distance: int | None = None
) -> Region | None:
    """Return the single nearest region to a position.

    Distance is measured from *pos* to the closest edge of each
    region (0 if *pos* is inside the region).

    Args:
        chrom: Chromosome name.
        pos: Query position.
        max_distance: If given, only consider regions within this
            many bp. ``None`` (default) means no limit.

    Returns:
        The nearest `Region`, or ``None`` if
        the chromosome has no regions (or none within
        *max_distance*).

    Examples:
        >>> idx = IntervalIndex.build([Region("chr1", 100, 200)])
        >>> idx.nearest("chr1", 50)
        Region(chrom='chr1', start=100, end=200, strand='.', score=0.0, name='', tags={})
    """
    ci = self._data.get(chrom)
    if ci is None or ci.n == 0:
        return None

    # Find insertion point for pos
    idx = bisect.bisect_right(ci.entries, pos, key=lambda e: e[0])

    best: Region | None = None
    best_dist = float("inf")

    # Check candidates around the insertion point
    for i in (idx - 1, idx):
        if 0 <= i < ci.n:
            s, e, region = ci.entries[i]
            dist = self._point_distance(pos, s, e)
            if dist < best_dist:
                best_dist = dist
                best = region

    # Also scan backwards from idx-2 in case of ties / overlapping
    # regions with the same start
    if idx >= 2:
        s, e, region = ci.entries[idx - 2]
        dist = self._point_distance(pos, s, e)
        if dist < best_dist:
            best_dist = dist
            best = region

    if max_distance is not None and best_dist > max_distance:
        return None

    return best

within_distance

within_distance(chrom: str, pos: int, distance: int) -> list[Region]

Return all regions within a given distance of a position.

Distance is measured from pos to the closest edge of each region (0 if pos is inside the region). Complexity is O(log n + k) — delegates to overlapping() with an expanded window.

Parameters:

Name Type Description Default
chrom str

Chromosome name.

required
pos int

Query position.

required
distance int

Maximum distance in bp (inclusive).

required

Returns:

Type Description
list[Region]

List of Region objects within

list[Region]

distance bp, ordered by start position.

Examples:

>>> idx = IntervalIndex.build([
...     Region("chr1", 100, 200),
...     Region("chr1", 500, 600),
... ])
>>> len(idx.within_distance("chr1", 50, 100))
1
Source code in src/seqchain/primitives/intervals.py
def within_distance(
    self, chrom: str, pos: int, distance: int
) -> list[Region]:
    """Return all regions within a given distance of a position.

    Distance is measured from *pos* to the closest edge of each
    region (0 if *pos* is inside the region).  Complexity is
    O(log n + k) — delegates to `overlapping()` with an
    expanded window.

    Args:
        chrom: Chromosome name.
        pos: Query position.
        distance: Maximum distance in bp (inclusive).

    Returns:
        List of `Region` objects within
        *distance* bp, ordered by start position.

    Examples:
        >>> idx = IntervalIndex.build([
        ...     Region("chr1", 100, 200),
        ...     Region("chr1", 500, 600),
        ... ])
        >>> len(idx.within_distance("chr1", 50, 100))
        1
    """
    return self.overlapping(chrom, pos - distance, pos + distance + 1)

__len__

__len__() -> int

Total number of indexed regions across all chromosomes.

Returns:

Type Description
int

Region count.

Source code in src/seqchain/primitives/intervals.py
def __len__(self) -> int:
    """Total number of indexed regions across all chromosomes.

    Returns:
        Region count.
    """
    return sum(ci.n for ci in self._data.values())

chromosomes

chromosomes() -> list[str]

Return the list of chromosomes present in the index.

Returns:

Type Description
list[str]

Sorted list of chromosome names.

Examples:

>>> idx = IntervalIndex.build([Region("chr2", 0, 10), Region("chr1", 0, 10)])
>>> idx.chromosomes()
['chr1', 'chr2']
Source code in src/seqchain/primitives/intervals.py
def chromosomes(self) -> list[str]:
    """Return the list of chromosomes present in the index.

    Returns:
        Sorted list of chromosome names.

    Examples:
        >>> idx = IntervalIndex.build([Region("chr2", 0, 10), Region("chr1", 0, 10)])
        >>> idx.chromosomes()
        ['chr1', 'chr2']
    """
    chroms: list[str] = []
    for chrom in self._data:
        chroms.append(chrom)
    chroms.sort()
    return chroms

intersect_intervals

intersect_intervals(query: Iterable[Region], reference: Iterable[Region]) -> Iterator[tuple[Region, Region]]

Compute pairwise intersections of two region sets.

For each query region, finds all overlapping reference regions and yields (fragment, ref) pairs. The fragment is the query region with coordinates clipped to the intersection — it preserves the query's name, strand, score, and tags. The ref is the original reference region (unmodified).

Uses a sorted sweep-line grouped by chromosome. Reference regions are grouped by chromosome and sorted by start; each query region scans only its own chromosome's sorted slice, stopping early when ref.start >= query.end.

Parameters:

Name Type Description Default
query Iterable[Region]

Input regions to intersect.

required
reference Iterable[Region]

Reference regions to intersect against.

required

Yields:

Type Description
Region

(fragment, ref) tuples where fragment is the query region

Region

clipped to the overlap with ref. Query regions with no

tuple[Region, Region]

overlapping reference regions are silently dropped.

Examples:

>>> q = [Region("chr1", 100, 300)]
>>> r = [Region("chr1", 0, 200, name="A"), Region("chr1", 200, 400, name="B")]
>>> [(f.start, f.end, ref.name) for f, ref in intersect_intervals(q, r)]
[(100, 200, 'A'), (200, 300, 'B')]
Source code in src/seqchain/primitives/intervals.py
def intersect_intervals(
    query: Iterable[Region],
    reference: Iterable[Region],
) -> Iterator[tuple[Region, Region]]:
    """Compute pairwise intersections of two region sets.

    For each query region, finds all overlapping reference regions and
    yields ``(fragment, ref)`` pairs.  The *fragment* is the query region
    with coordinates clipped to the intersection — it preserves the
    query's name, strand, score, and tags.  The *ref* is the original
    reference region (unmodified).

    Uses a sorted sweep-line grouped by chromosome.  Reference regions
    are grouped by chromosome and sorted by start; each query region
    scans only its own chromosome's sorted slice, stopping early when
    ``ref.start >= query.end``.

    Args:
        query: Input regions to intersect.
        reference: Reference regions to intersect against.

    Yields:
        ``(fragment, ref)`` tuples where *fragment* is the query region
        clipped to the overlap with *ref*.  Query regions with no
        overlapping reference regions are silently dropped.

    Examples:
        >>> q = [Region("chr1", 100, 300)]
        >>> r = [Region("chr1", 0, 200, name="A"), Region("chr1", 200, 400, name="B")]
        >>> [(f.start, f.end, ref.name) for f, ref in intersect_intervals(q, r)]
        [(100, 200, 'A'), (200, 300, 'B')]
    """
    # Group reference by chromosome without calling list() — use dict + append
    ref_by_chrom: dict[str, list[Region]] = {}
    for r in reference:
        if r.chrom not in ref_by_chrom:
            ref_by_chrom[r.chrom] = []
        ref_by_chrom[r.chrom].append(r)
    # Sort each chromosome's slice in-place (sorted() not needed — .sort() is fine)
    for chrom_refs in ref_by_chrom.values():
        chrom_refs.sort(key=lambda r: (r.start, r.end))

    for q in query:
        chrom_refs = ref_by_chrom.get(q.chrom, [])
        for ref in chrom_refs:
            if ref.start >= q.end:
                break  # sorted by start — no further refs can overlap
            if ref.end <= q.start:
                continue
            int_start = max(q.start, ref.start)
            int_end = min(q.end, ref.end)
            if int_start < int_end:
                fragment = replace(q, start=int_start, end=int_end)
                yield (fragment, ref)

overlay

overlay(query: Iterable[Region], reference: Iterable[Region], *, name_tag: str | None = None) -> Iterator[Region]

Spatial join with tag merge.

For each query region, finds all overlapping reference regions via intersect_intervals(), clips to the intersection, and merges the reference region's tags into the fragment. If name_tag is given, promotes the reference region's name field into a tag on the output fragment.

Parameters:

Name Type Description Default
query Iterable[Region]

Input regions to overlay.

required
reference Iterable[Region]

Reference regions to join against.

required
name_tag str | None

If given, sets tags[name_tag] = ref.name on each output fragment. Without this, the reference's name field would not transfer (it's a field, not a tag).

None

Yields:

Type Description
Region

Regions with coordinates clipped to the intersection and tags

Region

merged from both query and reference. Query regions with no

Region

overlapping reference are silently dropped.

Examples:

>>> q = [Region("chr1", 100, 300, name="state", tags={"mark": "H3"})]
>>> r = [Region("chr1", 0, 200, name="promoter", tags={"gene": "G1"})]
>>> result = list(overlay(q, r, name_tag="feature_type"))
>>> result[0].tags["feature_type"]
'promoter'
>>> result[0].tags["gene"]
'G1'
>>> result[0].tags["mark"]
'H3'
Source code in src/seqchain/primitives/intervals.py
def overlay(
    query: Iterable[Region],
    reference: Iterable[Region],
    *,
    name_tag: str | None = None,
) -> Iterator[Region]:
    """Spatial join with tag merge.

    For each query region, finds all overlapping reference regions via
    `intersect_intervals()`, clips to the intersection, and merges
    the reference region's tags into the fragment.  If *name_tag* is
    given, promotes the reference region's ``name`` field into a tag on
    the output fragment.

    Args:
        query: Input regions to overlay.
        reference: Reference regions to join against.
        name_tag: If given, sets ``tags[name_tag] = ref.name`` on each
            output fragment.  Without this, the reference's ``name``
            field would not transfer (it's a field, not a tag).

    Yields:
        Regions with coordinates clipped to the intersection and tags
        merged from both query and reference.  Query regions with no
        overlapping reference are silently dropped.

    Examples:
        >>> q = [Region("chr1", 100, 300, name="state", tags={"mark": "H3"})]
        >>> r = [Region("chr1", 0, 200, name="promoter", tags={"gene": "G1"})]
        >>> result = list(overlay(q, r, name_tag="feature_type"))
        >>> result[0].tags["feature_type"]
        'promoter'
        >>> result[0].tags["gene"]
        'G1'
        >>> result[0].tags["mark"]
        'H3'
    """
    for fragment, ref in intersect_intervals(query, reference):
        merged = {**fragment.tags, **ref.tags}
        if name_tag and ref.name:
            merged[name_tag] = ref.name
        yield replace(fragment, tags=merged)