diff --git a/client/src/components/Collections/pairing.ts b/client/src/components/Collections/pairing.ts index 351aab5288eb..872fd2bfff12 100644 --- a/client/src/components/Collections/pairing.ts +++ b/client/src/components/Collections/pairing.ts @@ -3,6 +3,7 @@ import type { GenericPair } from "@/components/Collections/common/buildCollectio export const COMMON_FILTERS = { illumina: ["_1", "_2"] as [string, string], Rs: ["_R1", "_R2"] as [string, string], + Fs: ["_F", "_R"] as [string, string], dot12s: [".1.fastq", ".2.fastq"] as [string, string], }; export type CommonFiltersType = keyof typeof COMMON_FILTERS; @@ -16,6 +17,7 @@ export function guessInitialFilterType(elements: HasName[]): CommonFiltersType | let illumina = 0; let dot12s = 0; let Rs = 0; + let Fs = 0; //should we limit the forEach? What if there are 1000s of elements? elements.forEach((element) => { @@ -23,19 +25,23 @@ export function guessInitialFilterType(elements: HasName[]): CommonFiltersType | dot12s++; } else if (element.name?.includes("_R1") || element.name?.includes("_R2")) { Rs++; + } else if (element.name?.includes("_F") || element.name?.includes("_R")) { + Fs++; } else if (element.name?.includes("_1") || element.name?.includes("_2")) { illumina++; } }); // if we cannot filter don't set an initial filter and hide all the data - if (illumina === 0 && dot12s === 0 && Rs === 0) { + if (illumina === 0 && dot12s === 0 && Rs === 0 && Fs === 0) { return null; - } else if (illumina > dot12s && illumina > Rs) { + } else if (illumina > dot12s && illumina > Rs && illumina > Fs) { return "illumina"; - } else if (dot12s > illumina && dot12s > Rs) { + } else if (dot12s > illumina && dot12s > Rs && dot12s > Fs) { return "dot12s"; - } else if (Rs > illumina && Rs > dot12s) { + } else if (Rs > illumina && Rs > dot12s && Rs > Fs) { return "Rs"; + } else if (Fs > illumina && Fs > dot12s && Fs > Rs) { + return "Fs"; } else { return "illumina"; } diff --git a/lib/galaxy/model/dataset_collections/auto_pairing.py b/lib/galaxy/model/dataset_collections/auto_pairing.py index 351c4657e52c..7433bb07e3d9 100644 --- a/lib/galaxy/model/dataset_collections/auto_pairing.py +++ b/lib/galaxy/model/dataset_collections/auto_pairing.py @@ -20,6 +20,7 @@ class HasName(Protocol): COMMON_FILTERS: dict[str, tuple[str, str]] = { "illumina": ("_1", "_2"), "Rs": ("_R1", "_R2"), + "Fs": ("_F", "_R"), "dot12s": (".1.fastq", ".2.fastq"), } @@ -116,25 +117,32 @@ def guess_initial_filter_type(elements: list[T]) -> Optional[str]: illumina = 0 dot12s = 0 Rs = 0 + Fs = 0 # Iterate through elements and count occurrences of filter patterns + # Order matters: more specific patterns must be checked before less specific ones + # (_R1/_R2 before _F/_R since _R is a substring of _R1) for element in elements: if ".1.fastq" in element.name or ".2.fastq" in element.name: dot12s += 1 elif "_R1" in element.name or "_R2" in element.name: Rs += 1 + elif "_F" in element.name or "_R" in element.name: + Fs += 1 elif "_1" in element.name or "_2" in element.name: illumina += 1 # Determine the most likely filter type - if illumina == 0 and dot12s == 0 and Rs == 0: + if illumina == 0 and dot12s == 0 and Rs == 0 and Fs == 0: return None - elif illumina > dot12s and illumina > Rs: + elif illumina > dot12s and illumina > Rs and illumina > Fs: return "illumina" - elif dot12s > illumina and dot12s > Rs: + elif dot12s > illumina and dot12s > Rs and dot12s > Fs: return "dot12s" - elif Rs > illumina and Rs > dot12s: + elif Rs > illumina and Rs > dot12s and Rs > Fs: return "Rs" + elif Fs > illumina and Fs > dot12s and Fs > Rs: + return "Fs" else: return "illumina" diff --git a/lib/galaxy/model/dataset_collections/auto_pairing_spec.yml b/lib/galaxy/model/dataset_collections/auto_pairing_spec.yml index 8b81d649abc8..f0dc8615a17c 100644 --- a/lib/galaxy/model/dataset_collections/auto_pairing_spec.yml +++ b/lib/galaxy/model/dataset_collections/auto_pairing_spec.yml @@ -74,3 +74,35 @@ input: forward: input_1.fastq.bz2 reverse: input_2.fastq.bz2 + +- doc: Simple _F/_R split. + inputs: + - input_F.fastq + - input_R.fastq + paired: + input: + forward: input_F.fastq + reverse: input_R.fastq + +- doc: Compressed _F/_R split (.gz). + inputs: + - ERR042228_F.fq.gz + - ERR042228_R.fq.gz + paired: + ERR042228: + forward: ERR042228_F.fq.gz + reverse: ERR042228_R.fq.gz + +- doc: Multiple _F/_R pairs with common prefixes. + inputs: + - ERR042228_F.fq.gz + - ERR042228_R.fq.gz + - ERR636028_F.fq.gz + - ERR636028_R.fq.gz + paired: + ERR042228: + forward: ERR042228_F.fq.gz + reverse: ERR042228_R.fq.gz + ERR636028: + forward: ERR636028_F.fq.gz + reverse: ERR636028_R.fq.gz