flojoy-ai · flojoy-bot · Jul 25, 2023
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/ISOLATION_FOREST.md b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/ISOLATION_FOREST.md
@@ -0,0 +1,47 @@
+
+[//]: # (Custom component imports)
+
+import DocString from '@site/src/components/DocString';
+import PythonCode from '@site/src/components/PythonCode';
+import AppDisplay from '@site/src/components/AppDisplay';
+import SectionBreak from '@site/src/components/SectionBreak';
+import AppendixSection from '@site/src/components/AppendixSection';
+
+[//]: # (Docstring)
+
+import DocstringSource from '!!raw-loader!./a1-[autogen]/docstring.txt';
+import PythonSource from '!!raw-loader!./a1-[autogen]/python_code.txt';
+
+<DocString>{DocstringSource}</DocString>
+<PythonCode GLink='AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/ISOLATION_FOREST.py'>{PythonSource}</PythonCode>
+
+<SectionBreak />
+
+
+
+[//]: # (Examples)
+
+## Examples
+
+<AppDisplay 
+  GLink='AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST'
+  nodeLabel='ISOLATION_FOREST'>
+</AppDisplay>
+
+<SectionBreak />
+
+
+
+[//]: # (Appendix)
+
+import Notes from '!!raw-loader!./appendix/notes.md';
+import Hardware from '!!raw-loader!./appendix/hardware.md';
+import Media from '!!raw-loader!./appendix/media.md';
+
+## Appendix
+
+<AppendixSection index={0} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Notes}</AppendixSection>
+<AppendixSection index={1} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Hardware}</AppendixSection>
+<AppendixSection index={2} folderPath='nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/'>{Media}</AppendixSection>
+
+
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/a1-[autogen]/docstring.txt b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/a1-[autogen]/docstring.txt
@@ -0,0 +1,13 @@
+
+The ISOLATION_FOREST node uses the Isolation Forest algorithm to detect anomalous points in a tabular dataset.
+Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
+
+Parameters
+----------
+contamination: float, default=0 (auto)
+    The estimated proportion of outliers in the dataset. A value of 0 selects scikit-learn's 'auto' setting.
+
+Returns
+-------
+dataframe
+    The input dataframe with two additional columns: 'anomaly' (the predicted label, -1 for outliers and 1 for inliers) and 'anomaly_scores' (the decision-function value).
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/a1-[autogen]/python_code.txt
@@ -0,0 +1,21 @@
+from flojoy import flojoy, DataFrame as FlojoyDataFrame
+from sklearn.ensemble import IsolationForest
+
+
+@flojoy
+def ISOLATION_FOREST(
+    default: FlojoyDataFrame,
+    contamination: float = 0,
+) -> FlojoyDataFrame:
+    # Flag anomalous rows of the input table with scikit-learn's IsolationForest.
+    # Work on a copy so adding the result columns never mutates the input container.
+    df = default.m.copy()
+    if contamination == 0:  # 0 is this node's sentinel for sklearn's "auto" threshold
+        contamination = "auto"
+    model = IsolationForest(contamination=contamination)
+    model.fit(df)
+    # Score and label using the feature columns only, before the outputs are added.
+    results = model.decision_function(df)
+    df['anomaly'] = model.predict(df)
+    df['anomaly_scores'] = results
+    return FlojoyDataFrame(df=df)
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/hardware.md b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/hardware.md
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/media.md b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/media.md
diff --git a/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/notes.md b/docs/nodes/AI_ML/ANOMALY_DETECTION/ISOLATION_FOREST/appendix/notes.md
diff --git a/docs/nodes/AI_ML/IMAGE_CAPTIONING/NLP_CONNECT_VIT_GPT2/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/IMAGE_CAPTIONING/NLP_CONNECT_VIT_GPT2/a1-[autogen]/python_code.txt
@@ -16,6 +16,7 @@ def NLP_CONNECT_VIT_GPT2(default: Image) -> DataFrame:
     import pandas as pd
 
     import transformers
+    import torch
     import torchvision.transforms.functional as TF
     from flojoy import snapshot_download
 
@@ -36,10 +37,11 @@ def NLP_CONNECT_VIT_GPT2(default: Image) -> DataFrame:
     feature_extractor = transformers.ViTImageProcessor.from_pretrained(local_repo_path)
     tokenizer = transformers.AutoTokenizer.from_pretrained(local_repo_path)
 
-    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values  # type: ignore
-    output_ids = model.generate(pixel_values, max_length=16, num_beams=4)  # type: ignore
-    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)  # type: ignore
-    pred = preds[0].strip()
+    with torch.inference_mode():
+        pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values  # type: ignore
+        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)  # type: ignore
+        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)  # type: ignore
+        pred = preds[0].strip()
 
     df_pred = pd.DataFrame.from_records([(pred,)], columns=["caption"])
 

diff --git a/docs/nodes/AI_ML/NLP/COUNT_VECTORIZER/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/NLP/COUNT_VECTORIZER/a1-[autogen]/python_code.txt
@@ -1,6 +1,7 @@
 from typing import TypedDict
 from sklearn.feature_extraction.text import CountVectorizer
 from flojoy import flojoy, DataFrame, Matrix, Vector
+import numpy as np
 import pandas as pd
 
 
@@ -24,6 +25,6 @@ def COUNT_VECTORIZER(default: DataFrame | Matrix | Vector) -> CountVectorizerOut
     X = vectorizer.fit_transform(data.flatten())
 
     x = pd.DataFrame({"tokens": vectorizer.get_feature_names_out()})
-    y = X.toarray()
+    y = X.toarray()  # type: ignore
 
     return CountVectorizerOutput(tokens=DataFrame(df=x), word_count_vector=Vector(v=y))
diff --git a/docs/nodes/AI_ML/PREDICT_TIME_SERIES/PROPHET_PREDICT/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/PREDICT_TIME_SERIES/PROPHET_PREDICT/a1-[autogen]/python_code.txt
@@ -1,8 +1,5 @@
-import pandas as pd
-from flojoy import flojoy, DataFrame, DataContainer
-from prophet import Prophet
+from flojoy import flojoy, run_in_venv, DataFrame, DataContainer
 from typing import TypedDict
-from prophet.serialize import model_to_json
 
 
 class ProphetPredictOutput(TypedDict):
@@ -11,11 +8,70 @@ class ProphetPredictOutput(TypedDict):
 
 
 @flojoy(deps={"prophet": "1.1.4", "holidays": "0.26", "pystan": "2.19.1.1"})
+@run_in_venv(
+    pip_dependencies=[
+        "prophet==1.1.4",
+    ]
+)
 def PROPHET_PREDICT(
     default: DataFrame, run_forecast: bool = True, periods: int = 365
 ) -> ProphetPredictOutput:
 
 
+    import os
+    import sys
+    import pandas as pd
+    import numpy as np
+
+    import prophet
+    from prophet.serialize import model_to_json
+
+    def _make_dummy_dataframe_for_prophet():
+        """Generate random time series data to test if prophet works."""
+        # One sample per calendar day, end date inclusive.
+        timestamps = pd.date_range(
+            start=pd.Timestamp("2023-01-01"), end=pd.Timestamp("2023-07-20"), freq="D"
+        )
+        data = np.random.randn(len(timestamps))  # Random data points
+        # PROPHET model expects first column to be `ds` and second to be `y`,
+        # so build the frame with those names directly instead of renaming after.
+        df = pd.DataFrame({"ds": timestamps, "y": data})
+
+        return df
+
+    def _apply_macos_prophet_hotfix():
+        """This is a hotfix for MacOS: add the TBB rpath to prophet's Stan binary.
+
+        See https://github.com/facebook/prophet/issues/2250#issuecomment-1559516328 for more detail
+        """
+        import subprocess
+        if sys.platform != "darwin":
+            return
+        # Test if prophet works (i.e. if the hotfix had already been applied)
+        try:
+            prophet.Prophet().fit(_make_dummy_dataframe_for_prophet())
+        except RuntimeError:
+            print("Could not run prophet, applying hotfix...")
+        else:
+            return
+        # Locate the single cmdstan-xxxxx dir inside the prophet package
+        stan_dir = os.path.join(prophet.__path__[0], "stan_model")  # type: ignore
+        cmdstan_dirs = [x for x in os.listdir(stan_dir) if x.startswith("cmdstan")]
+        if len(cmdstan_dirs) != 1:
+            raise RuntimeError("Could not find cmdstan dir")
+        rpath = f"@executable_path/{cmdstan_dirs[0]}/stan/lib/stan_math/lib/tbb"
+        # Run from stan_dir via `cwd=`: no shell string, no process-wide chdir to restore
+        try:
+            subprocess.run(
+                ["install_name_tool", "-add_rpath", rpath, "prophet_model.bin"],
+                cwd=stan_dir,
+                check=True,
+            )
+        except (OSError, subprocess.CalledProcessError) as err:
+            raise RuntimeError("Could not apply hotfix") from err
+
+    _apply_macos_prophet_hotfix()
+
     df = default.m
     first_col = df.iloc[:, 0]
     if not pd.api.types.is_datetime64_any_dtype(first_col):
@@ -25,7 +81,7 @@ def PROPHET_PREDICT(
     df.rename(
         columns={df.columns[0]: "ds", df.columns[1]: "y"}, inplace=True
     )  # PROPHET model expects first column to be `ds` and second to be `y`
-    model = Prophet()
+    model = prophet.Prophet()
     model.fit(df)
     extra = {"prophet": model_to_json(model), "run_forecast": run_forecast}
     # If run_forecast, the return df is the forecast, otherwise the original

diff --git a/docs/nodes/AI_ML/SEGMENTATION/DEEPLAB_V3/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/SEGMENTATION/DEEPLAB_V3/a1-[autogen]/python_code.txt
@@ -1,55 +1,64 @@
-from flojoy import flojoy, Image
-
-import torch
-from torchvision import transforms
-import torchvision.transforms.functional as TF
-
-from PIL import Image as PIL_Image
-import numpy as np
+from flojoy import flojoy, run_in_venv, Image
 
 
 @flojoy
+@run_in_venv(
+    pip_dependencies=[
+        "torch==2.0.1",
+        "torchvision==0.15.2",
+        "Pillow==9.5.0",
+        "numpy==1.24.3",
+    ]
+)
 def DEEPLAB_V3(default: Image) -> Image:
 
-    input_image = default
 
+    import os
+    import numpy as np
+    from PIL import Image as PIL_Image
+    import torch
+    from torchvision import transforms
+    import torchvision.transforms.functional as TF
+    from flojoy.utils import FLOJOY_CACHE_DIR
+
+    # Parse input image
+    input_image = default
     r, g, b, a = input_image.r, input_image.g, input_image.b, input_image.a
     nparray = (
         np.stack((r, g, b, a), axis=2) if a is not None else np.stack((r, g, b), axis=2)
     )
-
+    # Convert input image
     input_image = TF.to_pil_image(nparray).convert("RGB")
-
+    # Set torch hub cache directory
+    torch.hub.set_dir(os.path.join(FLOJOY_CACHE_DIR, "torch_hub"))
     model = torch.hub.load(
         "pytorch/vision:v0.10.0", "deeplabv3_resnet50", pretrained=True
     )
     model.eval()
-
+    # Preprocessing
     preprocess_transform = transforms.Compose(
         [
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ]
     )
-
+    # Feed the input image to the model
     input_tensor = preprocess_transform(input_image)
     input_batch = input_tensor.unsqueeze(0)
-
-    with torch.no_grad():
+    with torch.inference_mode():
         output = model(input_batch)["out"][0]
-
+    # Fetch the output
     output_predictions = output.argmax(0)
     palette = torch.tensor([2**25 - 1, 2**15 - 1, 2**21 - 1])
     colors = torch.as_tensor([i for i in range(21)])[:, None] * palette
     colors = (colors % 255).numpy().astype("uint8")
-
     # plot the semantic segmentation predictions of 21 classes in each color
     r = PIL_Image.fromarray(output_predictions.byte().cpu().numpy()).resize(
         input_image.size
     )
     r.putpalette(colors)
     out_img = np.array(r.convert("RGB"))
-
+    # Build the output image
     return Image(
         r=out_img[:, :, 0],
         g=out_img[:, :, 1],

diff --git a/docs/nodes/AI_ML/TEXT_SUMMARIZATION/BART_LARGE_CNN/a1-[autogen]/docstring.txt b/docs/nodes/AI_ML/TEXT_SUMMARIZATION/BART_LARGE_CNN/a1-[autogen]/docstring.txt
@@ -1,5 +1,5 @@
-The BART_LARGE_CNN node takes an input dataframe with multiple rows and a single "text" column,
-    and produces a dataframe with a single "summary_text" column.  The "summary_text" column contains a summary
+The BART_LARGE_CNN node takes an input dataframe with multiple rows and a single column,
+    and produces a dataframe with a single "summary_text" column. The "summary_text" column contains a summary
     of the text in the corresponding row of the input dataframe.
 
     Parameters

diff --git a/docs/nodes/AI_ML/TEXT_SUMMARIZATION/BART_LARGE_CNN/a1-[autogen]/python_code.txt b/docs/nodes/AI_ML/TEXT_SUMMARIZATION/BART_LARGE_CNN/a1-[autogen]/python_code.txt
@@ -1,35 +1,47 @@
-from flojoy import flojoy, DataFrame
-import torch
-from transformers import BartTokenizer, BartForConditionalGeneration
-import pandas as pd
+from flojoy import flojoy, run_in_venv, DataFrame
 
 
 @flojoy
+@run_in_venv(
+    pip_dependencies=[
+        "transformers==4.30.2",
+        "torch==2.0.1",
+        "torchvision==0.15.2",
+        "pandas==1.5.3",
+    ]
+)
 def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
 
+
+    import torch
+    from flojoy import snapshot_download
+    from transformers import BartTokenizer, BartForConditionalGeneration
+    import pandas as pd
+
     input_df = default.m
 
     assert (
         len(input_df.columns.tolist()) == 1
     ), "Can only take a single-column dataframe as input"
 
-    # Load the pre-trained BART model
-    model = BartForConditionalGeneration.from_pretrained(
-        "facebook/bart-large-cnn", revision="3d22493"
-    )
-    tokenizer = BartTokenizer.from_pretrained(
-        "facebook/bart-large-cnn", revision="3d22493"
+    # Load the repo from either the local cache or from the web, and get the local path
+    local_path = snapshot_download(
+        repo_id="facebook/bart-large-cnn", revision="3d22493"
     )
 
+    # Load the pre-trained BART model
+    model = BartForConditionalGeneration.from_pretrained(local_path)
+    tokenizer = BartTokenizer.from_pretrained(local_path)
+
     def _chunk_text(text):
         inputs_no_trunc = tokenizer(
             text, max_length=None, return_tensors="pt", truncation=False
         )
         chunks = []
-        for i in range(
-            0, len(inputs_no_trunc["input_ids"][0]), tokenizer.model_max_length
-        ):
-            chunk = inputs_no_trunc["input_ids"][0][i : i + tokenizer.model_max_length]
+        step = 1024
+        # step = tokenizer.model_max_length - 1
+        for i in range(0, len(inputs_no_trunc["input_ids"][0]), step):
+            chunk = inputs_no_trunc["input_ids"][0][i : i + step]
             chunks.append(torch.unsqueeze(chunk, 0))
         return chunks
 
@@ -39,7 +51,7 @@ def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
             model.generate(
                 chunk,
                 num_beams=4,
-                max_length=tokenizer.model_max_length // 2,
+                max_length=1024 // 2,
                 early_stopping=True,
             )
             for chunk in chunks
@@ -59,7 +71,8 @@ def BART_LARGE_CNN(default: DataFrame) -> DataFrame:
 
     column = input_df.columns[0]
 
-    output_df = pd.DataFrame(
-        input_df[column].apply(_summarize_text).rename("summary_text")
-    )
+    with torch.inference_mode():
+        output_df = pd.DataFrame(
+            input_df[column].apply(_summarize_text).rename("summary_text")
+        )
     return DataFrame(df=output_df)