Skip to content
This repository was archived by the owner on Nov 24, 2023. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@

[//]: # (Custom component imports)

import DocString from '@site/src/components/DocString';
import PythonCode from '@site/src/components/PythonCode';
import AppDisplay from '@site/src/components/AppDisplay';
import SectionBreak from '@site/src/components/SectionBreak';
import AppendixSection from '@site/src/components/AppendixSection';

[//]: # (Docstring)

import DocstringSource from '!!raw-loader!./a1-[autogen]/docstring.txt';
import PythonSource from '!!raw-loader!./a1-[autogen]/python_code.txt';

<DocString>{DocstringSource}</DocString>
<PythonCode GLink='AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/ISOLATION_FOREST.py'>{PythonSource}</PythonCode>

<SectionBreak />



[//]: # (Examples)

## Examples

<AppDisplay
GLink='AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST'
nodeLabel='ISOLATION_FOREST'>
</AppDisplay>

<SectionBreak />



[//]: # (Appendix)

import Notes from '!!raw-loader!./appendix/notes.md';
import Hardware from '!!raw-loader!./appendix/hardware.md';
import Media from '!!raw-loader!./appendix/media.md';

## Appendix

<AppendixSection index={0} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Notes}</AppendixSection>
<AppendixSection index={1} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Hardware}</AppendixSection>
<AppendixSection index={2} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Media}</AppendixSection>


Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

The ISOLATION_FOREST node uses the Isolation Forest algorithm to detect anomalous points in a tabular dataset.
Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

Parameters
----------
contamination: float, default=0 (auto)
The estimated proportion of outliers in the dataset. A value of 0 lets scikit-learn determine the threshold automatically ("auto").

Returns
-------
dataframe
The input dataframe with two additional columns: 'anomaly_scores' and 'anomaly'.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from flojoy import flojoy, DataFrame as FlojoyDataFrame
from sklearn.ensemble import IsolationForest


@flojoy
def ISOLATION_FOREST(
    default: FlojoyDataFrame,
    contamination: float = 0,
) -> FlojoyDataFrame:
    """Detect anomalous rows of a dataframe with the Isolation Forest algorithm.

    Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

    Parameters
    ----------
    default: DataFrame
        The tabular input data. Every column is passed to the model.
    contamination: float, default=0 (auto)
        The estimated proportion of outliers in the dataset. A value of 0 is
        translated to scikit-learn's "auto" setting.

    Returns
    -------
    DataFrame
        The input data with two additional columns: 'anomaly' (the result of
        ``IsolationForest.predict``) and 'anomaly_scores' (the result of
        ``IsolationForest.decision_function``).
    """
    # Work on a copy: the columns added below must not leak into the
    # dataframe object owned by whoever produced `default`.
    df = default.m.copy()

    # 0 is this node's sentinel for "let scikit-learn pick the threshold".
    model_contamination = "auto" if contamination == 0 else contamination

    model = IsolationForest(contamination=model_contamination)
    model.fit(df)

    # Compute both outputs before adding any column, so the model only ever
    # sees the feature columns it was fitted on.
    scores = model.decision_function(df)
    labels = model.predict(df)

    df["anomaly"] = labels
    df["anomaly_scores"] = scores

    return FlojoyDataFrame(df=df)
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def NLP_CONNECT_VIT_GPT2(default: Image) -> DataFrame:
import pandas as pd

import transformers
import torch
import torchvision.transforms.functional as TF
from flojoy import snapshot_download

Expand All @@ -36,10 +37,11 @@ def NLP_CONNECT_VIT_GPT2(default: Image) -> DataFrame:
feature_extractor = transformers.ViTImageProcessor.from_pretrained(local_repo_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(local_repo_path)

pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values # type: ignore
output_ids = model.generate(pixel_values, max_length=16, num_beams=4) # type: ignore
preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) # type: ignore
pred = preds[0].strip()
with torch.inference_mode():
pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values # type: ignore
output_ids = model.generate(pixel_values, max_length=16, num_beams=4) # type: ignore
preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) # type: ignore
pred = preds[0].strip()

df_pred = pd.DataFrame.from_records([(pred,)], columns=["caption"])

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import TypedDict
from sklearn.feature_extraction.text import CountVectorizer
from flojoy import flojoy, DataFrame, Matrix, Vector
import numpy as np
import pandas as pd


Expand All @@ -24,6 +25,6 @@ def COUNT_VECTORIZER(default: DataFrame | Matrix | Vector) -> CountVectorizerOut
X = vectorizer.fit_transform(data.flatten())

x = pd.DataFrame({"tokens": vectorizer.get_feature_names_out()})
y = X.toarray()
y = X.toarray() # type: ignore

return CountVectorizerOutput(tokens=DataFrame(df=x), word_count_vector=Vector(v=y))
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import pandas as pd
from flojoy import flojoy, DataFrame, DataContainer
from prophet import Prophet
from flojoy import flojoy, run_in_venv, DataFrame, DataContainer
from typing import TypedDict
from prophet.serialize import model_to_json


class ProphetPredictOutput(TypedDict):
Expand All @@ -11,11 +8,70 @@ class ProphetPredictOutput(TypedDict):


@flojoy(deps={"prophet": "1.1.4", "holidays": "0.26", "pystan": "2.19.1.1"})
@run_in_venv(
pip_dependencies=[
"prophet==1.1.4",
]
)
def PROPHET_PREDICT(
default: DataFrame, run_forecast: bool = True, periods: int = 365
) -> ProphetPredictOutput:


import os
import sys
import pandas as pd
import numpy as np

import prophet
from prophet.serialize import model_to_json

def _make_dummy_dataframe_for_prophet():
    """Generate random time series data to test if prophet works.

    Returns a two-column dataframe: `ds` holds one timestamp per day from
    2023-01-01 to 2023-07-20 (inclusive) and `y` holds one random value per
    timestamp.
    """
    timestamps = pd.date_range(
        start=pd.Timestamp("2023-01-01"),
        end=pd.Timestamp("2023-07-20"),
        freq="D",
    )
    # A local, seeded generator keeps this check reproducible and leaves
    # NumPy's global random state untouched.
    rng = np.random.default_rng(0)
    # PROPHET model expects first column to be `ds` and second to be `y`
    return pd.DataFrame(
        {"ds": timestamps, "y": rng.standard_normal(len(timestamps))}
    )

def _apply_macos_prophet_hotfix():
    """Apply the MacOS hotfix for prophet when it is needed.

    See https://github.com/facebook/prophet/issues/2250#issuecomment-1559516328 for more detail.

    Does nothing on platforms other than MacOS, or when a test fit already
    succeeds (i.e. the hotfix had already been applied). Otherwise runs
    `install_name_tool -add_rpath` on `prophet_model.bin` inside prophet's
    `stan_model` directory.

    Raises
    ------
    RuntimeError
        If the cmdstan directory cannot be identified, or if
        `install_name_tool` cannot be run or exits with a non-zero status.
    """
    import subprocess

    if sys.platform != "darwin":
        return

    # Test if prophet works (i.e. if the hotfix had already been applied)
    dummy_df = _make_dummy_dataframe_for_prophet()
    try:
        prophet.Prophet().fit(dummy_df)
    except RuntimeError:
        print("Could not run prophet, applying hotfix...")
    else:
        return

    prophet_dir = prophet.__path__[0]  # type: ignore
    # Get stan dir
    stan_dir = os.path.join(prophet_dir, "stan_model")
    # Find cmdstan-xxxxx dir
    cmdstan_dirs = [x for x in os.listdir(stan_dir) if x.startswith("cmdstan")]
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if len(cmdstan_dirs) != 1:
        raise RuntimeError("Could not find cmdstan dir")
    cmdstan_basename = cmdstan_dirs[0]

    # Run (from stan_dir) : install_name_tool -add_rpath @executable_path/<CMDSTAN_BASENAME>/stan/lib/stan_math/lib/tbb prophet_model.bin
    # An argument list with `cwd=` avoids both the shell and a process-wide
    # chdir that would need to be undone afterwards.
    cmd = [
        "install_name_tool",
        "-add_rpath",
        f"@executable_path/{cmdstan_basename}/stan/lib/stan_math/lib/tbb",
        "prophet_model.bin",
    ]
    try:
        completed = subprocess.run(cmd, cwd=stan_dir, check=False)
    except OSError as err:
        # e.g. install_name_tool is not installed; report it like any other
        # failure to apply the hotfix.
        raise RuntimeError("Could not apply hotfix") from err
    if completed.returncode != 0:
        raise RuntimeError("Could not apply hotfix")

_apply_macos_prophet_hotfix()

df = default.m
first_col = df.iloc[:, 0]
if not pd.api.types.is_datetime64_any_dtype(first_col):
Expand All @@ -25,7 +81,7 @@ def PROPHET_PREDICT(
df.rename(
columns={df.columns[0]: "ds", df.columns[1]: "y"}, inplace=True
) # PROPHET model expects first column to be `ds` and second to be `y`
model = Prophet()
model = prophet.Prophet()
model.fit(df)
extra = {"prophet": model_to_json(model), "run_forecast": run_forecast}
# If run_forecast, the return df is the forecast, otherwise the original
Expand Down
Original file line number Diff line number Diff line change
@@ -1,55 +1,64 @@
from flojoy import flojoy, Image

import torch
from torchvision import transforms
import torchvision.transforms.functional as TF

from PIL import Image as PIL_Image
import numpy as np
from flojoy import flojoy, run_in_venv, Image


@flojoy
@run_in_venv(
pip_dependencies=[
"torch==2.0.1",
"torchvision==0.15.2",
"Pillow==9.5.0",
"numpy==1.24.3",
]
)
def DEEPLAB_V3(default: Image) -> Image:

input_image = default

import os
import numpy as np
from PIL import Image as PIL_Image
import torch
from torchvision import transforms
import torchvision.transforms.functional as TF
from flojoy.utils import FLOJOY_CACHE_DIR

# Parse input image
input_image = default
r, g, b, a = input_image.r, input_image.g, input_image.b, input_image.a
nparray = (
np.stack((r, g, b, a), axis=2) if a is not None else np.stack((r, g, b), axis=2)
)

# Convert input image
input_image = TF.to_pil_image(nparray).convert("RGB")

# Set torch hub cache directory
torch.hub.set_dir(os.path.join(FLOJOY_CACHE_DIR, "torch_hub"))
model = torch.hub.load(
"pytorch/vision:v0.10.0", "deeplabv3_resnet50", pretrained=True
)
model.eval()

# Preprocessing
preprocess_transform = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)

# Feed the input image to the model
input_tensor = preprocess_transform(input_image)
input_batch = input_tensor.unsqueeze(0)

with torch.no_grad():
with torch.inference_mode():
output = model(input_batch)["out"][0]

# Fetch the output
output_predictions = output.argmax(0)
palette = torch.tensor([2**25 - 1, 2**15 - 1, 2**21 - 1])
colors = torch.as_tensor([i for i in range(21)])[:, None] * palette
colors = (colors % 255).numpy().astype("uint8")

# plot the semantic segmentation predictions of 21 classes in each color
r = PIL_Image.fromarray(output_predictions.byte().cpu().numpy()).resize(
input_image.size
)
r.putpalette(colors)
out_img = np.array(r.convert("RGB"))

# Build the output image
return Image(
r=out_img[:, :, 0],
g=out_img[:, :, 1],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
The BART_LARGE_CNN node takes an input dataframe with multiple rows and a single "text" column,
and produces a dataframe with a single "summary_text" column. The "summary_text" column contains a summary
The BART_LARGE_CNN node takes an input dataframe with multiple rows and a single column,
and produces a dataframe with a single "summary_text" column. The "summary_text" column contains a summary
of the text in the corresponding row of the input dataframe.

Parameters
Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,47 @@
from flojoy import flojoy, DataFrame
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import pandas as pd
from flojoy import flojoy, run_in_venv, DataFrame


@flojoy
@run_in_venv(
pip_dependencies=[
"transformers==4.30.2",
"torch==2.0.1",
"torchvision==0.15.2",
"pandas==1.5.3",
]
)
def BART_LARGE_CNN(default: DataFrame) -> DataFrame:


import torch
from flojoy import snapshot_download
from transformers import BartTokenizer, BartForConditionalGeneration
import pandas as pd

input_df = default.m

assert (
len(input_df.columns.tolist()) == 1
), "Can only take a single-column dataframe as input"

# Load the pre-trained BART model
model = BartForConditionalGeneration.from_pretrained(
"facebook/bart-large-cnn", revision="3d22493"
)
tokenizer = BartTokenizer.from_pretrained(
"facebook/bart-large-cnn", revision="3d22493"
# Load the repo from either the local cache or from the web, and get the local path
local_path = snapshot_download(
repo_id="facebook/bart-large-cnn", revision="3d22493"
)

# Load the pre-trained BART model
model = BartForConditionalGeneration.from_pretrained(local_path)
tokenizer = BartTokenizer.from_pretrained(local_path)

def _chunk_text(text):
inputs_no_trunc = tokenizer(
text, max_length=None, return_tensors="pt", truncation=False
)
chunks = []
for i in range(
0, len(inputs_no_trunc["input_ids"][0]), tokenizer.model_max_length
):
chunk = inputs_no_trunc["input_ids"][0][i : i + tokenizer.model_max_length]
step = 1024
# step = tokenizer.model_max_length - 1
for i in range(0, len(inputs_no_trunc["input_ids"][0]), step):
chunk = inputs_no_trunc["input_ids"][0][i : i + step]
chunks.append(torch.unsqueeze(chunk, 0))
return chunks

Expand All @@ -39,7 +51,7 @@ def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
model.generate(
chunk,
num_beams=4,
max_length=tokenizer.model_max_length // 2,
max_length=1024 // 2,
early_stopping=True,
)
for chunk in chunks
Expand All @@ -59,7 +71,8 @@ def BART_LARGE_CNN(default: DataFrame) -> DataFrame:

column = input_df.columns[0]

output_df = pd.DataFrame(
input_df[column].apply(_summarize_text).rename("summary_text")
)
with torch.inference_mode():
output_df = pd.DataFrame(
input_df[column].apply(_summarize_text).rename("summary_text")
)
return DataFrame(df=output_df)
Loading