# LucaOne

LucaOne: Generalized Biological Foundation Model with Unified Nucleic Acid and Protein Language.

Github Page: https://github.com/LucaOne/LucaOne

This repo contains the weights (checkpoint = 17600000) and the core code (modified to fit the Hugging Face API; it may be unstable at this stage) for the LucaOne general-purpose language model (LucaOneGPLM).

To calculate the embedding of a nucleotide/protein sequence:

```python
import torch
from transformers import AutoModel, AutoTokenizer

def gene_seq_replace(seq):
    '''
    Map a nucleic acid (gene) sequence onto the model's vocabulary:
    A->1, U/T->2, C->3, G->4, unknown (e.g., N)->5.
    e.g. gene_seq_replace("ATCG") -> "1234"
    :param seq: nucleotide sequence (str)
    :return: mapped sequence (str)
    '''
    new_seq = ""
    for ch in seq:
        if ch in ["A", "a"]:
            new_seq += "1"
        elif ch in ["T", "U", "t", "u"]:
            new_seq += "2"
        elif ch in ["C", "c"]:
            new_seq += "3"
        elif ch in ["G", "g"]:
            new_seq += "4"
        else: # unknown
            new_seq += "5"
    return new_seq


model = AutoModel.from_pretrained("Yuanfei/LucaOne", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Yuanfei/LucaOne", trust_remote_code=True)

# Test input
seq = "ATCGCGAGTAGCGAGNNNAGCGAT"
seq_type = "gene" # or "prot"

if seq_type == "gene":
    seq = gene_seq_replace(seq)

print("seq len: %d:" % len(seq))

# Test run
seq_encoded = tokenizer.encode(seq)
input_ids = torch.tensor(seq_encoded, dtype=torch.int64).unsqueeze(0)

print("input_ids:")
print(input_ids)

if seq_type == "gene":
    token_type_ids = torch.zeros_like(input_ids)
else:
    token_type_ids = torch.ones_like(input_ids)

encoding = {
    "input_ids": input_ids, 
    "token_type_ids": token_type_ids, 
}

if seq_type == "prot":
    new_encoding = {}
    for item in encoding.items():
        new_encoding[item[0] + "_b"] = item[1]
    encoding = new_encoding

batch = encoding
batch["return_dict"] = True  # return a ModelOutput so fields can be accessed by name

res = model(**batch)

if seq_type == "prot":
    embedding = res.hidden_states_b
else:
    embedding = res.hidden_states

print("embedding matrix(include [CLS] and [SEP]):")
print(embedding)
print(embedding.shape)

print("[CLS] embedding vector:")
cls_vec = embedding[0, 0, :]
print(cls_vec)
print(cls_vec.shape)
```
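
For a single fixed-length vector per sequence, a common convention (not specific to LucaOne) is to mean-pool the per-token embeddings while skipping the special positions; a minimal sketch using the `embedding` tensor computed above:

```python
# Mean-pool the per-token embeddings into one vector per sequence,
# dropping the [CLS] (first) and [SEP] (last) positions.
# `embedding` has shape (batch, seq_len, hidden_dim).
seq_vec = embedding[:, 1:-1, :].mean(dim=1)  # (batch, hidden_dim)
print(seq_vec.shape)
```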

If loading the tokenizer fails with "ValueError: Tokenizer class AlphabetTokenizer does not exist or is not currently imported.", try running alphabet.py from this repo first.
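
If that still fails, one workaround is to fetch and import the module yourself before loading the tokenizer, so that the AlphabetTokenizer class is defined; a sketch, assuming alphabet.py sits at the top level of this repo:

```python
import importlib.util
from huggingface_hub import hf_hub_download

# Download alphabet.py from the model repo and import it so that
# AlphabetTokenizer is defined before AutoTokenizer.from_pretrained runs.
path = hf_hub_download(repo_id="Yuanfei/LucaOne", filename="alphabet.py")
spec = importlib.util.spec_from_file_location("alphabet", path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
```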

