ipd committed on
Commit
1c5d630
·
1 Parent(s): 9d70548

add readme

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="16">
8
+ <item index="0" class="java.lang.String" itemvalue="accelerate" />
9
+ <item index="1" class="java.lang.String" itemvalue="matplotlib" />
10
+ <item index="2" class="java.lang.String" itemvalue="torch-geometric" />
11
+ <item index="3" class="java.lang.String" itemvalue="torchinfo" />
12
+ <item index="4" class="java.lang.String" itemvalue="caikit" />
13
+ <item index="5" class="java.lang.String" itemvalue="pytorch-fast-transformers" />
14
+ <item index="6" class="java.lang.String" itemvalue="e3nn" />
15
+ <item index="7" class="java.lang.String" itemvalue="rdkit" />
16
+ <item index="8" class="java.lang.String" itemvalue="PyImpetus" />
17
+ <item index="9" class="java.lang.String" itemvalue="torch-scatter" />
18
+ <item index="10" class="java.lang.String" itemvalue="torch-nl" />
19
+ <item index="11" class="java.lang.String" itemvalue="torch-sparse" />
20
+ <item index="12" class="java.lang.String" itemvalue="mordred" />
21
+ <item index="13" class="java.lang.String" itemvalue="xgboost" />
22
+ <item index="14" class="java.lang.String" itemvalue="mamba-ssm" />
23
+ <item index="15" class="java.lang.String" itemvalue="evaluate" />
24
+ </list>
25
+ </value>
26
+ </option>
27
+ </inspection_tool>
28
+ </profile>
29
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/materials.selfies-ted.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/materials.selfies-ted.iml" filepath="$PROJECT_DIR$/.idea/materials.selfies-ted.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
README.md CHANGED
@@ -1,3 +1,87 @@
1
  ---
2
  license: apache-2.0
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ library_name: transformers
4
+ pipeline_tag: feature-extraction
5
+ tags:
6
+ - chemistry
7
  ---
8
+
9
+ # selfies-ted
10
+
11
+ selfies-ted is a project for encoding SMILES (Simplified Molecular Input Line Entry System) into SELFIES (SELF-referencing Embedded Strings) and generating embeddings for molecular representations.
12
+
13
+ ![selfies-ted](selfies-ted.png)
14
+ ## Model Architecture
15
+
16
+ Configuration details
17
+
18
+ - Encoder and decoder FFN dimensions: 256
19
+ - Number of attention heads: 4
20
+ - Number of encoder and decoder layers: 2
21
+ - Total number of hidden layers: 6
22
+ - Maximum position embeddings: 128
23
+ - Model dimension (d_model): 256
24
+
25
+ ## Pretrained Models and Training Logs
26
+ We provide checkpoints of the selfies-ted model pre-trained on a dataset of molecules curated from PubChem. The pre-trained model shows competitive performance on molecular representation tasks. For model weights: "HuggingFace link".
27
+
28
+ To install and use the pre-trained model:
29
+
30
+ 1. Download the `selfies_ted_model.pkl` file from the "HuggingFace link".
31
+ 2. Add the `selfies_ted_model.pkl` file to the `models/` directory. The directory structure should look like the following:
32
+
33
+ ```
34
+ models/
35
+ └── selfies_ted_model.pkl
36
+ ```
37
+
38
+ ## Installation
39
+
40
+ To use this project, you'll need to install the required dependencies. We recommend using a virtual environment:
41
+
42
+ ```bash
43
+ python -m venv venv
44
+ source venv/bin/activate # On Windows use `venv\Scripts\activate`
45
+ ```
46
+
47
+ Install the required dependencies:
48
+
49
+ ```
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+
54
+ ## Usage
55
+
56
+ ### Import
57
+
58
+ ```
59
+ import load
60
+ ```
61
+ ### Training the Model
62
+
63
+ To train the model, use the train.py script:
64
+
65
+ ```
66
+ python train.py -f <path_to_your_data_file>
67
+ ```
68
+
69
+
70
+ Note: The actual usage may depend on the specific implementation in load.py. Please refer to the source code for detailed functionality.
71
+
72
+ ### Load the model and tokenizer
73
+ ```
74
+ load.load("path/to/checkpoint.pkl")
75
+ ```
76
+ ### Encode SMILES strings
77
+ ```
78
+ smiles_list = ["COC", "CCO"]
79
+ ```
80
+ ```
81
+ embeddings = load.encode(smiles_list)
82
+ ```
83
+
84
+
85
+ ## Example Notebook
86
+
87
+ An example notebook for this project is provided in `selfies-ted-example.ipynb`.