Skip to content
Open
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.DS_Store
.idea/
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
Ariel: Minor updates to the code so that it installs with pip, as of July 2023

# BioBert Embeddings
Token and sentence level embeddings from BioBERT model (Biomedical Domain).

Expand Down Expand Up @@ -27,6 +29,7 @@ sentence embedding generated is 768 dimensional embedding.

```python
from biobert_embedding.embedding import BiobertEmbedding
from scipy.spatial.distance import cosine as cosine_distance

## Example 1
text = "Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis."\
Expand All @@ -50,7 +53,7 @@ print("Shape of Sentence Embedding = ",len(sentence_embedding))
sentence_vector1 = biobert.sentence_vector('Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis.')
sentence_vector2 = biobert.sentence_vector('Breast cancers with HER2 amplification are more aggressive, have a higher risk of CNS metastasis, and poorer prognosis.')

cosine_sim = 1 - distance.cosine(sentence_vector1, sentence_vector2)
cosine_sim = 1 - cosine_distance(sentence_vector1, sentence_vector2)
print('cosine similarity:', cosine_sim)
#cosine similarity: 0.992756187915802
```
79 changes: 0 additions & 79 deletions biobert_embedding/downloader.py

This file was deleted.

64 changes: 54 additions & 10 deletions biobert_embedding/embedding.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,58 @@
import os
import torch
import logging
import tensorflow as tf
from pathlib import Path
from biobert_embedding import downloader
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import requests
from tqdm import tqdm
from pytorch_pretrained_bert import BertTokenizer, BertModel

__author__ = 'Jitendra Jangid, Ariel Lubonja'


huggingface_repo = "https://huggingface.co/Ariel4/biobert-embeddings/resolve/main/"

__author__ = 'Jitendra Jangid'

#Create and configure logger
logging.basicConfig(filename='app.log', filemode='w',format='%(asctime)s %(message)s', level=logging.INFO)

logger = logging.getLogger(__name__)


def download_or_use_existing(model_folder_path, filename):
    """
    Ensure *filename* exists inside *model_folder_path*.

    If the file is already present it is reused; otherwise it is streamed
    from the HuggingFace repo with a tqdm progress bar.

    :param model_folder_path: folder that holds (or will hold) the model files
    :param filename: name of the file to check for / download
    """
    # os.path.join is robust whether or not the folder path has a trailing slash
    target = os.path.join(model_folder_path, filename)
    if os.path.isfile(target):
        print("Using existing " + target)
        return

    # Download with Progress Bar
    response = requests.get(huggingface_repo + filename, stream=True)
    # Fail fast on HTTP errors instead of silently writing an error page
    # to disk as if it were model weights
    response.raise_for_status()

    print("Downloading " + filename + " from HuggingFace")

    total = int(response.headers.get('content-length', 0))
    with tqdm(total=total, unit='iB', unit_scale=True, ncols=70) as bar:
        with open(target, 'wb') as f:
            for data in response.iter_content(chunk_size=1024):
                size = f.write(data)
                bar.update(size)

    print("File Downloaded! It is stored in: " + target)


def setup_model(model_folder_path="models/"):
    """
    Ensure the BioBERT model files exist locally, downloading any that are missing.

    :param model_folder_path: folder that will hold pytorch_model.bin,
        config.json and vocab.txt
    :return: the folder path containing the model files
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(model_folder_path, exist_ok=True)

    for filename in ("pytorch_model.bin", "config.json", "vocab.txt"):
        download_or_use_existing(model_folder_path, filename)

    return model_folder_path

class BiobertEmbedding(object):
"""
Encoding from BioBERT model (BERT finetuned on PubMed articles).
Expand All @@ -25,18 +65,19 @@ class BiobertEmbedding(object):
"""

def __init__(self, model_path=None):

if model_path is not None:
self.model_path = model_path
else:
self.model_path = downloader.get_BioBert("google drive")
model_path = setup_model() # Folder containing pytorch_model.bin, config.json and vocab.txt

self.model_path = model_path

self.tokens = ""
self.sentence_tokens = ""
# This needs the model folder path, not the pytorch_model.bin path
self.tokenizer = BertTokenizer.from_pretrained(self.model_path)
# Load pre-trained model (weights)
self.model = BertModel.from_pretrained(self.model_path)
logger.info("Initialization Done !!")



def process_text(self, text):

Expand All @@ -47,6 +88,9 @@ def process_text(self, text):


def handle_oov(self, tokenized_text, word_embeddings):
"""
Handle out-of-vocabulary words by appending the word embeddings of the subwords
"""
embeddings = []
tokens = []
oov_len = 1
Expand Down
13 changes: 6 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@
setuptools.setup(
name="biobert-embedding",
packages=['biobert_embedding'],
version="0.1.2",
author="Jitendra Jangid",
author_email="jitujangid38@gmail.com",
version="0.1.4",
author="Jitendra Jangid, Ariel Lubonja",
author_email="jitujangid38@gmail.com, ariellubonja@live.com",
description="Embeddings from BioBERT",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/overfitter/biobert_embedding",
download_url="https://github.com/overfitter/biobert_embedding/archive/v0.1.2.tar.gz",
url="https://github.com/ariellubonja/biobert_embedding",
download_url="https://github.com/ariellubonja/biobert_embedding/archive/v0.1.3.tar.gz",
install_requires=[
'torch==1.2.0',
'torch==2.1.2',
'pytorch-pretrained-bert==0.6.2',
'tensorflow',
],

classifiers=[
Expand Down