Batch Processing¶

PHOENIX provides efficient batch screening for analyzing multiple compounds.

Basic Usage¶

from phoenix import screen

# Example compounds keyed by common name; only the SMILES strings are screened.
compounds = {
    "Ethanol": "CCO",
    "Acetone": "CC(=O)C",
    "Nitrobenzene": "c1ccccc1[N+](=O)[O-]",
    "TNT": "Cc1c([N+](=O)[O-])cc([N+](=O)[O-])cc1[N+](=O)[O-]",
}
smiles_list = list(compounds.values())

# One call screens the whole list; the returned object exposes the
# success and failure counts.
results = screen(smiles_list)
print(f"Screened {results.successful} compounds successfully")
print(f"Failed: {results.failed}")

BatchResult¶

The screen() function returns a BatchResult object:

Attributes¶

Attribute	Type	Description
`dataframe`	pd.DataFrame	Results DataFrame
`successful`	int	Number of successful screenings
`failed`	int	Number of failed screenings

Methods¶

Method	Description
`to_csv(path)`	Export to CSV file
`to_json()`	Export to JSON string
`summary()`	Get summary statistics

DataFrame Columns¶

The results DataFrame contains:

Column	Type	Description
`smiles`	str	Input SMILES
`canonical_smiles`	str	Canonicalized SMILES
`formula`	str	Molecular formula
`mw`	float	Molecular weight (g/mol)
`delta_hf_kJ_mol`	float	ΔHf° (kJ/mol)
`delta_hd_kJ_mol`	float	ΔHd (kJ/mol)
`delta_hd_cal_g`	float	ΔHd (cal/g)
`ob_percent`	float	Oxygen balance (%)
`hazard_class`	str	HIGH, MEDIUM, or LOW
`triggered_criteria`	list	CHETAH criteria triggered
`alerts`	list	Functional group alerts
`gas_volume_L_g`	float	Gas generation (L/g)
`error`	str	Error type if failed
`error_message`	str	Error details if failed

Working with Results¶

Accessing the DataFrame¶

results = screen(smiles_list)
df = results.dataframe

# View all columns
print(df.columns.tolist())

# Basic statistics
print(df.describe())

Filtering Results¶

df = results.dataframe

# High hazard compounds
high_hazard = df[df['hazard_class'] == 'HIGH']
print(f"High hazard: {len(high_hazard)}")

# Rows without a result (e.g. failed compounds) may hold NaN instead of a list
# in the list-valued columns. NaN is truthy, so a plain truthiness check would
# let it through and `1 in x` / `len(x)` would raise TypeError. Check for a
# real list instead.

# Compounds with specific criteria
criterion_1 = df[df['triggered_criteria'].apply(lambda x: isinstance(x, (list, tuple)) and 1 in x)]

# Compounds with alerts
has_alerts = df[df['alerts'].apply(lambda x: isinstance(x, (list, tuple)) and len(x) > 0)]

Sorting Results¶

df = results.dataframe

# Sort by ΔHd (most exothermic first): ascending order, so the most negative
# ΔHd comes first.
sorted_df = df.sort_values('delta_hd_cal_g')

# Sort by hazard class
hazard_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2, None: 3}
# map() yields NaN for values that are not dictionary keys, and a missing
# hazard class stored as NaN is not guaranteed to match the None key. Assign
# the lowest-priority rank explicitly so 'hazard_rank' never contains NaN.
# Note: this adds the 'hazard_rank' column to the results DataFrame in place.
df['hazard_rank'] = df['hazard_class'].map(hazard_order).fillna(hazard_order[None])
sorted_df = df.sort_values('hazard_rank')

Exporting Results¶

To CSV¶

results = screen(smiles_list)
results.to_csv("screening_results.csv")

To JSON¶

json_str = results.to_json()
print(json_str)

# Write to file. JSON is exchanged as UTF-8, so set the encoding explicitly
# instead of relying on the platform-dependent default.
with open("results.json", "w", encoding="utf-8") as f:
    f.write(json_str)

To Excel¶

df = results.dataframe
df.to_excel("results.xlsx", index=False)

Summary Statistics¶

results = screen(smiles_list)
summary = results.summary()

print(f"Total compounds: {summary['total_compounds']}")
print(f"Successful: {summary['successful']}")
print(f"Failed: {summary['failed']}")
print(f"Hazard distribution: {summary['hazard_class_counts']}")
print(f"ΔHd range: {summary['delta_hd_cal_g_min']:.1f} to {summary['delta_hd_cal_g_max']:.1f} cal/g")

Progress Reporting¶

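Pass a `progress_callback` to `screen()` to receive progress updates. The callback is called with two arguments, the current count and the total: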

def progress_callback(current, total):
    """Print a single-line progress indicator that overwrites itself.

    Args:
        current: Progress count passed by screen() (presumably the number
            of compounds processed so far).
        total: Total count passed by screen().
    """
    # Guard against a zero total so the callback can never raise
    # ZeroDivisionError.
    percent = (current / total) * 100 if total else 0.0
    # "\r" returns to the start of the line and end="" suppresses the newline,
    # so each update replaces the previous one. flush=True is needed because a
    # line-buffered terminal would otherwise hold the text back until a
    # newline is written.
    print(f"\rProgress: {current}/{total} ({percent:.1f}%)", end="", flush=True)

results = screen(smiles_list, progress_callback=progress_callback)
print()  # Newline after progress

With tqdm¶

from tqdm import tqdm

def update_progress(current, total):
    """Advance the progress bar by one step each time the callback is invoked."""
    # `pbar` is looked up when the callback runs, i.e. inside the `with`
    # block below, so it is safe to define the function first.
    pbar.update(1)

# The context manager closes the bar even if screen() raises.
with tqdm(total=len(smiles_list)) as pbar:
    results = screen(smiles_list, progress_callback=update_progress)

Error Handling¶

Failed compounds are captured in the DataFrame:

results = screen(smiles_list)
df = results.dataframe

# Find failed compounds: rows whose 'error' column is populated.
error_mask = df['error'].notna()
failed = df[error_mask]

# Report each failure as one record (column name -> value).
for row in failed.to_dict(orient="records"):
    print(f"SMILES: {row['smiles']}")
    print(f"Error: {row['error']}")
    print(f"Message: {row['error_message']}")

Common Errors¶

Error	Cause
`InvalidSmilesError`	Invalid SMILES string
`UnsupportedElementError`	Contains unsupported element
`UnsupportedStructureError`	Charged/radical species
`MissingGroupError`	Benson GA data unavailable

Graceful Degradation¶

Batch processing continues even when individual compounds fail:

smiles_list = [
    "CCO",           # Valid
    "invalid",       # Invalid SMILES
    "[Fe]",          # Unsupported element
    "c1ccccc1",      # Valid
]

results = screen(smiles_list)

print(f"Successful: {results.successful}")  # 2
print(f"Failed: {results.failed}")          # 2

Large-Scale Screening¶

Memory Considerations¶

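For very large datasets, process the input in chunks. Note that the example below still keeps every chunk's DataFrame in memory until the final `pd.concat`; if the combined results are too large, write each chunk to disk (for example with `results.to_csv`) instead of accumulating them: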

import pandas as pd

def screen_in_chunks(all_smiles, chunk_size=1000):
    """Screen a large list of SMILES in fixed-size chunks.

    Args:
        all_smiles: Sequence of SMILES strings (must support len() and
            slicing).
        chunk_size: Maximum number of compounds passed to screen() per call.

    Returns:
        A single DataFrame with the rows of every chunk, re-indexed from 0.
        An empty DataFrame is returned when all_smiles is empty.

    Raises:
        ValueError: If chunk_size is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    total = len(all_smiles)
    if total == 0:
        # pd.concat() raises ValueError on an empty list, so short-circuit.
        return pd.DataFrame()

    all_dfs = []
    for start in range(0, total, chunk_size):
        stop = min(start + chunk_size, total)
        results = screen(all_smiles[start:stop])
        all_dfs.append(results.dataframe)
        print(f"Processed {stop}/{total}")

    return pd.concat(all_dfs, ignore_index=True)

# Usage
all_smiles = [...]  # Large list
combined_df = screen_in_chunks(all_smiles)

Parallel Processing¶

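For CPU-bound screening, split the input into chunks and screen them in separate worker processes. When running this as a script, call `parallel_screen` from under an `if __name__ == "__main__":` guard, because worker processes may re-import the main module: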

from concurrent.futures import ProcessPoolExecutor
import pandas as pd

def screen_chunk(smiles_chunk):
    """Screen one chunk of SMILES and return its results DataFrame.

    Kept as a plain module-level function so that it can be pickled and
    handed to the worker processes of a ProcessPoolExecutor.
    """
    batch_result = screen(smiles_chunk)
    return batch_result.dataframe

def parallel_screen(all_smiles, n_workers=4, chunk_size=100):
    """Screen SMILES in parallel across worker processes.

    Args:
        all_smiles: Sequence of SMILES strings (must support len() and
            slicing).
        n_workers: Maximum number of worker processes.
        chunk_size: Number of compounds handed to a worker per task.

    Returns:
        A single DataFrame with the rows of every chunk in input order,
        re-indexed from 0. An empty DataFrame is returned when all_smiles
        is empty.

    Raises:
        ValueError: If chunk_size is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    if len(all_smiles) == 0:
        # Nothing to do: skip starting a process pool, and avoid pd.concat()
        # raising ValueError on an empty list.
        return pd.DataFrame()

    chunks = [all_smiles[i:i + chunk_size]
              for i in range(0, len(all_smiles), chunk_size)]

    # executor.map() returns results in the order the chunks were submitted.
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        dfs = list(executor.map(screen_chunk, chunks))

    return pd.concat(dfs, ignore_index=True)

# Usage
# Worker processes may re-import this module (e.g. under the "spawn" start
# method), so the call that starts the pool must not run at import time.
if __name__ == "__main__":
    combined_df = parallel_screen(all_smiles)

Practical Examples¶

Screening a Chemical Library¶

from phoenix import screen

# Read SMILES from file. A .smi line starts with the SMILES and may be
# followed by whitespace and a compound name, so keep only the first token.
# Blank lines are skipped. Only the SMILES token is kept, so undecodable
# bytes in a trailing name are replaced rather than treated as an error.
with open("compounds.smi", encoding="utf-8", errors="replace") as f:
    smiles_list = [line.split()[0] for line in f if line.strip()]

# Screen all compounds
results = screen(smiles_list)

# Save results
results.to_csv("screening_results.csv")

# Print summary
summary = results.summary()
hazard_counts = summary['hazard_class_counts']
print("\nScreening Summary:")
print(f"  Total: {summary['total_compounds']}")
print(f"  Successful: {summary['successful']}")
# .get() with a default covers classes that did not occur in this batch.
print(f"  High hazard: {hazard_counts.get('HIGH', 0)}")
print(f"  Medium hazard: {hazard_counts.get('MEDIUM', 0)}")
print(f"  Low hazard: {hazard_counts.get('LOW', 0)}")

Identifying High-Risk Compounds¶

results = screen(smiles_list)
df = results.dataframe

# Find high-risk compounds: classified HIGH, or ΔHd below -200 cal/g.
is_high_class = df['hazard_class'] == 'HIGH'
below_threshold = df['delta_hd_cal_g'] < -200
high_risk = df[is_high_class | below_threshold]

print("High-risk compounds:")
for row in high_risk.to_dict(orient="records"):
    print(f"  {row['formula']}: {row['hazard_class']} ({row['delta_hd_cal_g']:.0f} cal/g)")

Hazard Statistics Report¶

from collections import Counter
from itertools import chain

results = screen(smiles_list)
df = results.dataframe

# Generate report
print("Hazard Screening Report")
print("=" * 50)
print(f"Compounds screened: {len(df)}")
print(f"Successful: {results.successful}")
print(f"Failed: {results.failed}")

print("\nHazard Class Distribution:")
# value_counts() ignores missing values, so only classified rows are counted.
for hazard, count in df['hazard_class'].value_counts().items():
    pct = count / results.successful * 100
    print(f"  {hazard}: {count} ({pct:.1f}%)")

print("\nΔHd Statistics:")
# Rows with no recorded error are the successfully screened compounds.
successful_df = df[df['error'].isna()]
delta_hd = successful_df['delta_hd_cal_g']
print(f"  Min: {delta_hd.min():.1f} cal/g")
print(f"  Max: {delta_hd.max():.1f} cal/g")
print(f"  Mean: {delta_hd.mean():.1f} cal/g")

print("\nMost Common Alerts:")
# Each cell holds the list of alerts for one compound. Flatten them into a
# single list before counting; dropna() skips rows with no list.
all_alerts = list(chain.from_iterable(successful_df['alerts'].dropna()))
for alert, count in Counter(all_alerts).most_common(5):
    print(f"  {alert}: {count}")

Next Steps¶

Error Handling: Handling exceptions
Hazard Evaluation: Individual compound analysis
API Reference: BatchResult API