Skip to content

Commit

Permalink
Allow a list of file extensions for input FASTA files
Browse files Browse the repository at this point in the history
  • Loading branch information
njbirth committed Jan 12, 2024
1 parent ef287bb commit fa7dce5
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cocopye"
version = "0.2.2"
version = "0.3"
authors = [
{ name = "Niklas Birth, Nicolina Leppich, Dr. Peter Meinicke" },
]
Expand Down
6 changes: 3 additions & 3 deletions src/cocopye/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def core(cocopye_db: str,
uproc_model: str,
infolder: str,
pfam_version: int,
file_extension: str,
file_extensions: List[str],
num_threads: int,
print_progress: bool = True
) -> List[Result]:
Expand All @@ -113,13 +113,13 @@ def core(cocopye_db: str,
os.path.join(pfam_db, pfam_version),
uproc_model,
infolder,
file_extension,
file_extensions,
num_threads,
print_progress
)

if pfam_result is None:
print("\nError: No input file with extension " + file_extension + " found.")
print("\nError: No input file with extensions " + str(file_extensions) + " found.")
print("You can use --file-extension to specify a different one. Exiting.")
sys.exit(1)

Expand Down
13 changes: 7 additions & 6 deletions src/cocopye/pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def count_pfams(
pfam_dir: str,
model_dir: str,
bin_folder: str,
file_extension: str = "fna",
file_extensions: List[str],
num_threads: int = 8,
print_progress: bool = True
) -> Optional[Tuple[npt.NDArray[np.uint8], List[str], List[float]]]:
Expand All @@ -49,8 +49,8 @@ def count_pfams(
:param pfam_dir: Path to the UProC Pfam database directory
:param model_dir: Path to UProC model directory
:param bin_folder: Folder containing input bins in FASTA format
:param file_extension: File extension of the input FASTA files. Probably something like .fna or .fasta. Each file in
the bin folder that has this extension is considered a bin.
:param file_extensions: A list of allowed file extensions of the input FASTA files. Probably something like .fna or
.fasta. Each file in the bin folder that has one of these extensions is considered a bin.
:param num_threads: Number of threads that UProC should use. It is possible (and likely) that UProC ignores this
parameter, but you can try.
:param print_progress: Print a progress bar to stdout
Expand All @@ -59,7 +59,7 @@ def count_pfams(
extension) in the same order as they appear in the QueryMatrix. The third element is a list of count-ratios of the
input bins (number of pfams divided by bin size).
"""
bins = [file.rpartition(".")[0] for file in os.listdir(bin_folder) if file.rpartition(".")[2] == file_extension]
bins = [file for file in os.listdir(bin_folder) if file.rpartition(".")[2] in file_extensions]

if len(bins) == 0:
return None
Expand All @@ -79,13 +79,14 @@ def count_pfams(

lengths = {}

for bin_id in tqdm(
for bin_file in tqdm(
bins,
ncols=0,
desc="\033[0;37m[" + str(datetime.now()) + "]\033[0m Counting Pfams",
disable=not print_progress
):
for record in SeqIO.parse(os.path.join(bin_folder, bin_id + "." + file_extension), "fasta"):
bin_id = bin_file.rpartition(".")[0]
for record in SeqIO.parse(os.path.join(bin_folder, bin_file), "fasta"):
lengths[bin_id] = len(str(record.seq))
process_orf.stdin.write(">" + bin_id + "$$" + record.id + "\n")
process_orf.stdin.write(str(record.seq) + "\n")
Expand Down
4 changes: 2 additions & 2 deletions src/cocopye/ui/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def parse_args() -> argparse.Namespace:
help="Input folder containing bins in FASTA format")
run_parser.add_argument("-o", "--outfile", default="cocopye_output.csv",
help="Output file (default: cocopye_output.csv)")
run_parser.add_argument("--file-extension", default="fna",
help="File extension of the bin FASTA files (default: fna)")
run_parser.add_argument("--file-extensions", default="fasta,fna,fa",
help="Allowed file extensions for the FASTA files (default: fasta,fna,fa)")
run_parser.add_argument("-t", "--threads", default=str(min(8, numba.config.NUMBA_NUM_THREADS)),
help="Number of threads")
run_parser.add_argument("-v", "--verbosity", default="standard",
Expand Down
2 changes: 1 addition & 1 deletion src/cocopye/ui/terminal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def run():
config.CONFIG["external"]["uproc_models"],
config.ARGS.infolder,
24 if config.ARGS.pfam24 else 28,
config.ARGS.file_extension,
config.ARGS.file_extensions.split(","),
config.ARGS.threads
)

Expand Down
2 changes: 1 addition & 1 deletion src/cocopye/ui/web/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def estimate_task(self, config, pfam_version: int, infolder: str, debug: bool =
config["external"]["uproc_models"],
infolder,
pfam_version,
"fna",
["fna"],
1,
print_progress=debug
)[0]
Expand Down

0 comments on commit fa7dce5

Please sign in to comment.