Spaces:
Runtime error
Runtime error
| import os | |
| import datetime | |
class InterproScan:
    """Run the InterProScan launcher script on a FASTA file.

    Args:
        bash_path: Command used to start InterProScan, normally the path to
            ``interproscan.sh``. It is tokenised with ``shlex.split``, so a
            prefix such as ``"bash /opt/ips/interproscan.sh"`` also works.
        out_dir: Optional root directory for results. When given, ``run``
            reports ``save_dir`` relative to it; when omitted, ``save_dir``
            is reported unchanged.
    """

    # Letters counted as nucleotide symbols when guessing the sequence type:
    # the DNA/RNA bases plus N for an unknown base.
    _NUCLEOTIDE_LETTERS = frozenset("ACGTUN")
    # Share of nucleotide letters at or above which input is treated as
    # nucleotide. Below 1.0 so that a few ambiguity codes are tolerated.
    _NUCLEOTIDE_FRACTION = 0.9

    def __init__(self, bash_path, out_dir=None):
        self.bash_path = bash_path
        self.out_dir = out_dir

    def run(self, fasta_file, goterms, pathways, save_dir) -> dict:
        """Run InterProScan and report where the output was written.

        Args:
            fasta_file: Path of the input FASTA file.
            goterms: If truthy, add ``-goterms`` to the command.
            pathways: If truthy, add ``-pa`` to the command.
            save_dir: Value passed to ``-o``; it must exist as a non-empty
                directory afterwards for the run to count as successful.

        Returns:
            ``{"output_dir": ..., "duration": seconds}`` on success, or
            ``{"error": message}`` if the process could not be started,
            exited with a non-zero status, or left no output.

        Raises:
            OSError: If ``fasta_file`` cannot be read or the temp directory
                cannot be created (both happen before the process starts).
        """
        # Imported locally so the class does not depend on edits to the
        # file's import header.
        import shlex
        import subprocess

        start_time = datetime.datetime.now()

        # Scratch space lives next to save_dir. os.path.join keeps this
        # relative when save_dir has no parent component (plain string
        # concatenation would yield "/temp" at the filesystem root).
        temp_dir = os.path.join(os.path.dirname(save_dir), "temp")
        os.makedirs(temp_dir, exist_ok=True)

        seqs = self.read_fasta_to_list(fasta_file)
        is_protein = self.is_protein_sequence(seqs)

        # Build an argument list instead of a shell string so paths that
        # contain spaces or shell metacharacters are passed through intact.
        # NOTE(review): save_dir is passed to -o and then listed as a
        # directory below; confirm against the InterProScan CLI docs that
        # -o (rather than an output-directory option) is what is intended.
        cmd = shlex.split(str(self.bash_path)) + [
            "-i", str(fasta_file),
            "-o", str(save_dir),
            "-f", "JSON",
            "-T", temp_dir,
        ]
        if goterms:
            cmd.append("-goterms")
        if pathways:
            cmd.append("-pa")
        cmd += ["-t", "p" if is_protein else "n"]
        print(" ".join(cmd))

        try:
            completed = subprocess.run(cmd, check=False)
            spend_time = (datetime.datetime.now() - start_time).total_seconds()
            # A non-zero exit status is a failure even if save_dir still
            # holds files from an earlier run.
            succeeded = completed.returncode == 0 and bool(os.listdir(save_dir))
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            return {"error": str(e)}

        if not succeeded:
            return {"error": "InterproScan encountered an error. Please check your inputs and options."}

        shown_dir = self._relative_to_out_dir(save_dir)
        print(f"InterproScan successfully completed. Output saved to {shown_dir}.")
        return {"output_dir": shown_dir, "duration": spend_time}

    def _relative_to_out_dir(self, path):
        """Return ``path`` relative to ``self.out_dir`` when it lies inside it."""
        if not self.out_dir:
            return path
        try:
            relative = os.path.relpath(path, self.out_dir)
        except ValueError:
            # os.path.relpath raises when no relative path exists
            # (e.g. different drives on Windows).
            return path
        if relative == os.pardir or relative.startswith(os.pardir + os.sep):
            # path is outside out_dir; the full path is more useful.
            return path
        return relative

    def is_protein_sequence(self, sequences):
        """Guess whether the sequences are protein rather than nucleotide.

        All sequences are pooled and upper-cased; non-alphabetic characters
        are ignored. The input is treated as nucleotide when at least
        ``_NUCLEOTIDE_FRACTION`` of the letters are in
        ``_NUCLEOTIDE_LETTERS``. Unlike a count of distinct letters, this
        classifies short peptides as protein.

        Args:
            sequences: Iterable of sequence strings.

        Returns:
            True for protein, False for nucleotide or for input that
            contains no letters.
        """
        pooled = "".join(sequences).upper()
        letter_count = sum(1 for ch in pooled if ch.isalpha())
        if letter_count == 0:
            return False
        nucleotide_count = sum(1 for ch in pooled if ch in self._NUCLEOTIDE_LETTERS)
        return nucleotide_count / letter_count < self._NUCLEOTIDE_FRACTION

    def read_fasta_to_list(self, file_path):
        """Read a FASTA file and return its sequences in file order.

        Multi-line records are joined into one string. Header text is not
        returned, and lines before the first ``>`` header are ignored.

        Args:
            file_path: Path of the FASTA file.

        Returns:
            One string per ``>`` record (empty string for an empty record).
        """
        sequences = []
        chunks = None  # stays None until the first header line is seen
        with open(file_path, 'r') as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith(">"):
                    if chunks is not None:
                        sequences.append("".join(chunks))
                    chunks = []
                elif chunks is not None:
                    chunks.append(line)
        if chunks is not None:
            sequences.append("".join(chunks))
        return sequences
if __name__ == '__main__':
    # Manual smoke test: build a FASTA file from a pickled list of records
    # and run InterProScan on it with GO terms and pathways enabled.
    interproscan = InterproScan("interproscan/interproscan-5.75-106.0/interproscan.sh")

    from utils.utils import get_protein_sequence_biopython, tofasta
    import pickle

    # Each record is indexed by "uniprot_id" and "sequence" below.
    # pickle.load can execute code embedded in the file, so only point this
    # at a trusted pickle.
    with open("/zhangjiawei/interproscan/example/difference_20241122_ec_dict_list.pkl", "rb") as f:
        datas = pickle.load(f)
    uids = [record["uniprot_id"] for record in datas]
    seqs = [record["sequence"] for record in datas]

    fasta_file = "example/protein_go_clean.fasta"
    # Alternative source for the sequences, currently disabled:
    # seqs = [get_protein_sequence_biopython(uid) for uid in uids]
    tofasta(fasta_file, uids, seqs)

    interproscan.run(
        fasta_file=fasta_file,
        goterms=True,
        pathways=True,
        save_dir="output/interproscan",
    )