add readme
Browse files- .idea/.gitignore +3 -0
- .idea/inspectionProfiles/Project_Default.xml +29 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/materials.selfies-ted.iml +8 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- README.md +84 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
5 |
+
<option name="ignoredPackages">
|
6 |
+
<value>
|
7 |
+
<list size="16">
|
8 |
+
<item index="0" class="java.lang.String" itemvalue="accelerate" />
|
9 |
+
<item index="1" class="java.lang.String" itemvalue="matplotlib" />
|
10 |
+
<item index="2" class="java.lang.String" itemvalue="torch-geometric" />
|
11 |
+
<item index="3" class="java.lang.String" itemvalue="torchinfo" />
|
12 |
+
<item index="4" class="java.lang.String" itemvalue="caikit" />
|
13 |
+
<item index="5" class="java.lang.String" itemvalue="pytorch-fast-transformers" />
|
14 |
+
<item index="6" class="java.lang.String" itemvalue="e3nn" />
|
15 |
+
<item index="7" class="java.lang.String" itemvalue="rdkit" />
|
16 |
+
<item index="8" class="java.lang.String" itemvalue="PyImpetus" />
|
17 |
+
<item index="9" class="java.lang.String" itemvalue="torch-scatter" />
|
18 |
+
<item index="10" class="java.lang.String" itemvalue="torch-nl" />
|
19 |
+
<item index="11" class="java.lang.String" itemvalue="torch-sparse" />
|
20 |
+
<item index="12" class="java.lang.String" itemvalue="mordred" />
|
21 |
+
<item index="13" class="java.lang.String" itemvalue="xgboost" />
|
22 |
+
<item index="14" class="java.lang.String" itemvalue="mamba-ssm" />
|
23 |
+
<item index="15" class="java.lang.String" itemvalue="evaluate" />
|
24 |
+
</list>
|
25 |
+
</value>
|
26 |
+
</option>
|
27 |
+
</inspection_tool>
|
28 |
+
</profile>
|
29 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/materials.selfies-ted.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/materials.selfies-ted.iml" filepath="$PROJECT_DIR$/.idea/materials.selfies-ted.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
README.md
CHANGED
@@ -1,3 +1,87 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
+
library_name: transformers
|
4 |
+
pipeline_tag: feature-extraction
|
5 |
+
tags:
|
6 |
+
- chemistry
|
7 |
---
|
8 |
+
|
9 |
+
# selfies-ted
|
10 |
+
|
11 |
+
selfies-ted is a project for converting SMILES (Simplified Molecular-Input Line-Entry System) strings into SELFIES (SELF-referencing Embedded Strings) and generating embeddings from them for molecular representation tasks.
|
12 |
+
|
13 |
+
![selfies-ted](selfies-ted.png)
|
14 |
+
## Model Architecture
|
15 |
+
|
16 |
+
Configuration details
|
17 |
+
|
18 |
+
Encoder and Decoder FFN dimensions: 256
|
19 |
+
Number of attention heads: 4
|
20 |
+
Number of encoder and decoder layers: 2
|
21 |
+
Total number of hidden layers: 6
|
22 |
+
Maximum position embeddings: 128
|
23 |
+
Model dimension (d_model): 256
|
24 |
+
|
25 |
+
## Pretrained Models and Training Logs
|
26 |
+
We provide checkpoints of the selfies-ted model pre-trained on a dataset of molecules curated from PubChem. The pre-trained model shows competitive performance on molecular representation tasks. The model weights are available at: "HuggingFace link".
|
27 |
+
|
28 |
+
To install and use the pre-trained model:
|
29 |
+
|
30 |
+
Download the selfies_ted_model.pkl file from the "HuggingFace link".
|
31 |
+
Add the `selfies_ted_model.pkl` file to the `models/` directory. The directory structure should look like the following:
|
32 |
+
|
33 |
+
```
|
34 |
+
models/
|
35 |
+
└── selfies_ted_model.pkl
|
36 |
+
```
|
37 |
+
|
38 |
+
## Installation
|
39 |
+
|
40 |
+
To use this project, you'll need to install the required dependencies. We recommend using a virtual environment:
|
41 |
+
|
42 |
+
```bash
|
43 |
+
python -m venv venv
|
44 |
+
source venv/bin/activate # On Windows use `venv\Scripts\activate`
|
45 |
+
```
|
46 |
+
|
47 |
+
Install the required dependencies:
|
48 |
+
|
49 |
+
```
|
50 |
+
pip install -r requirements.txt
|
51 |
+
```
|
52 |
+
|
53 |
+
|
54 |
+
## Usage
|
55 |
+
|
56 |
+
### Import
|
57 |
+
|
58 |
+
```
|
59 |
+
import load
|
60 |
+
```
|
61 |
+
### Training the Model
|
62 |
+
|
63 |
+
To train the model, use the train.py script:
|
64 |
+
|
65 |
+
```
|
66 |
+
python train.py -f <path_to_your_data_file>
|
67 |
+
```
|
68 |
+
|
69 |
+
|
70 |
+
Note: The exact usage depends on the implementation in `load.py`. Please refer to the source code for details.
|
71 |
+
|
72 |
+
### Load the model and tokenizer
|
73 |
+
```
|
74 |
+
load.load("path/to/checkpoint.pkl")
|
75 |
+
```
|
76 |
+
### Encode SMILES strings
|
77 |
+
```
|
78 |
+
smiles_list = ["COC", "CCO"]
|
79 |
+
```
|
80 |
+
```
|
81 |
+
embeddings = load.encode(smiles_list)
|
82 |
+
```
|
83 |
+
|
84 |
+
|
85 |
+
## Example Notebook
|
86 |
+
|
87 |
+
An example notebook for this project is provided in `selfies-ted-example.ipynb`.
|