// Detect anomalies and outliers in datasets using statistical and ML methods. Use for data cleaning, fraud detection, or quality control analysis.
| name | outlier-detective |
| description | Detect anomalies and outliers in datasets using statistical and ML methods. Use for data cleaning, fraud detection, or quality control analysis. |
Detect anomalies and outliers in numeric data using multiple methods.
from outlier_detective import OutlierDetective
detective = OutlierDetective()
detective.load_csv("sales_data.csv")
# Detect outliers in a column
outliers = detective.detect("revenue", method="iqr")
print(f"Found {len(outliers)} outliers")
# Get full report
report = detective.analyze("revenue")
print(report)
# Detect outliers using IQR method
python outlier_detective.py --input data.csv --column sales --method iqr
# Use Z-score with custom threshold
python outlier_detective.py --input data.csv --column price --method zscore --threshold 3
# Analyze all numeric columns
python outlier_detective.py --input data.csv --all
# Generate visualization
python outlier_detective.py --input data.csv --column revenue --plot boxplot.png
# Export outliers to CSV
python outlier_detective.py --input data.csv --column value --output outliers.csv
# Use Isolation Forest (ML)
python outlier_detective.py --input data.csv --method isolation_forest
class OutlierDetective:
def __init__(self)
# Data loading
def load_csv(self, filepath: str, **kwargs) -> 'OutlierDetective'
def load_dataframe(self, df: pd.DataFrame) -> 'OutlierDetective'
# Detection (single column)
def detect(self, column: str, method: str = "iqr", **kwargs) -> pd.DataFrame
def analyze(self, column: str) -> dict
# Detection (multi-column)
def detect_multivariate(self, columns: list = None, method: str = "isolation_forest", **kwargs) -> pd.DataFrame
def analyze_all(self) -> dict
# Visualization
def plot_boxplot(self, column: str, output: str) -> str
def plot_scatter(self, col1: str, col2: str, output: str) -> str
def plot_distribution(self, column: str, output: str) -> str
# Export
def get_outliers(self, column: str, method: str = "iqr") -> pd.DataFrame
def get_clean_data(self, column: str, method: str = "iqr") -> pd.DataFrame
outliers = detective.detect("price", method="iqr", multiplier=1.5)
outliers = detective.detect("price", method="zscore", threshold=3)
outliers = detective.detect("price", method="modified_zscore", threshold=3.5)
outliers = detective.detect_multivariate(
method="isolation_forest",
contamination=0.1
)
outliers = detective.detect_multivariate(
method="lof",
n_neighbors=20
)
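# Returns a DataFrame of the outlier rows with two additional columns:
# - outlier_score: Signed measure of how extreme the value is (positive = above the upper bound, negative = below the lower bound)
# - outlier_reason: Description of why the row was flagged as an outlier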
index value outlier_score outlier_reason
0 15 5000 4.2 Above Q3 + 1.5×IQR
1 42 -1000 -3.8 Below Q1 - 1.5×IQR
{
"column": "revenue",
"total_rows": 1000,
"outlier_count": 23,
"outlier_percent": 2.3,
"methods": {
"iqr": {"count": 23, "indices": [...]},
"zscore": {"count": 18, "indices": [...]},
"modified_zscore": {"count": 20, "indices": [...]}
},
"stats": {
"mean": 5432.10,
"median": 4890.00,
"std": 1234.56,
"min": -1000.00,
"max": 15000.00,
"q1": 3500.00,
"q3": 6200.00,
"iqr": 2700.00
},
"bounds": {
"lower": -550.00,
"upper": 10250.00
}
}
detective = OutlierDetective()
detective.load_csv("raw_data.csv")
# Analyze and print a summary
report = detective.analyze("price")
print(f"Found {report['outlier_count']} outliers ({report['outlier_percent']:.1f}%)")
# Get clean data
clean_data = detective.get_clean_data("price", method="iqr")
clean_data.to_csv("clean_data.csv")
detective = OutlierDetective()
detective.load_csv("transactions.csv")
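# Use multiple methods for consensus
# (assumes detect() keeps the original row labels as the DataFrame index — verify; otherwise compare the "index" column instead)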
iqr_outliers = set(detective.detect("amount", method="iqr").index)
zscore_outliers = set(detective.detect("amount", method="zscore").index)
# Transactions flagged by both methods
high_confidence = iqr_outliers & zscore_outliers
print(f"High-confidence anomalies: {len(high_confidence)}")
detective = OutlierDetective()
detective.load_csv("sensors.csv")
# Detect multivariate outliers
outliers = detective.detect_multivariate(
columns=["temp", "pressure", "humidity"],
method="isolation_forest",
contamination=0.05
)
print(f"Anomalous readings: {len(outliers)}")
# Box plot with outliers highlighted
detective.plot_boxplot("revenue", "revenue_boxplot.png")
# Distribution with bounds
detective.plot_distribution("price", "price_dist.png")
# Scatter plot (2D outliers)
detective.plot_scatter("feature1", "feature2", "scatter.png")