"""Notebook helpers (clean/utils.py): display defaults, image search, plotting.

Importing this module has side effects: it calls ``set_seed(42)``, sets the
cuDNN ``deterministic``/``benchmark`` flags, adjusts matplotlib rcParams and
widens the numpy / pandas / torch print settings.
"""

import re

from fastai2.vision.all import *
from nbdev.showdoc import *
from ipywidgets import widgets
from pandas.api.types import CategoricalDtype

import matplotlib as mpl
# Disabled: mpl.rcParams['figure.dpi'] = 200
mpl.rcParams['savefig.dpi'] = 200
mpl.rcParams['font.size'] = 12

set_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Numpy and pandas by default assume a narrow screen - this fixes that
pd.set_option('display.max_columns', 999)
np.set_printoptions(linewidth=200)
torch.set_printoptions(linewidth=200)

import graphviz


def gv(s):
    """Return a ``graphviz.Source`` for the DOT statements `s`.

    `s` is wrapped in a ``digraph G`` whose ``rankdir`` is ``"LR"``.
    """
    return graphviz.Source('digraph G{ rankdir="LR"' + s + '; }')


def get_image_files_sorted(path, recurse=True, folders=None):
    """Return ``get_image_files(path, recurse, folders)`` with ``.sorted()`` applied."""
    return get_image_files(path, recurse, folders).sorted()


# +
# pip install azure-cognitiveservices-search-imagesearch

from azure.cognitiveservices.search.imagesearch import ImageSearchClient as api
from msrest.authentication import CognitiveServicesCredentials as auth


def search_images_bing(key, term, min_sz=128):
    """Query the image search service for `term` and return the hits as an ``L``.

    Args:
        key: subscription key handed to ``CognitiveServicesCredentials``.
        term: search query string.
        min_sz: passed as both ``min_height`` and ``min_width`` of the search.

    Returns:
        ``L`` wrapping the ``.value`` of the search response (the request asks
        for ``count=150`` results).
    """
    client = api('https://api.cognitive.microsoft.com', auth(key))
    response = client.images.search(query=term, count=150,
                                    min_height=min_sz, min_width=min_sz)
    return L(response.value)


# -

def plot_function(f, tx=None, ty=None, title=None, min=-2, max=2, figsize=(6,4)):
    """Plot ``f(x)`` for 100 evenly spaced ``x`` between `min` and `max`.

    Args:
        f: callable applied to the 1-D tensor of sample points.
        tx: optional x-axis label.
        ty: optional y-axis label.
        title: optional axes title.
        min: first sample point (name kept for caller compatibility even
            though it shadows the builtin inside this function).
        max: last sample point (same remark as `min`).
        figsize: forwarded to ``plt.subplots``.
    """
    # `steps` must be explicit: the implicit default (100) was deprecated and
    # later removed from torch.linspace. 100 keeps the historical sampling.
    x = torch.linspace(min, max, steps=100)
    _, ax = plt.subplots(figsize=figsize)
    ax.plot(x, f(x))
    if tx is not None: ax.set_xlabel(tx)
    if ty is not None: ax.set_ylabel(ty)
    if title is not None: ax.set_title(title)


# +
from sklearn.tree import export_graphviz


def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    """Render tree `t` as a ``graphviz.Source``.

    Args:
        t: estimator passed to ``sklearn.tree.export_graphviz``.
        df: object whose ``.columns`` supply the feature names.
        size: value written into the graph's ``size`` attribute.
        ratio: value written into the graph's ``ratio`` attribute.
        precision: forwarded to ``export_graphviz``.
        **kwargs: extra keyword arguments for ``export_graphviz``.

    Returns:
        ``graphviz.Source`` built from the exported DOT text with the
        ``size``/``ratio`` attributes injected after the ``Tree {`` header.
    """
    s = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                        rounded=True, special_characters=True, rotate=False,
                        precision=precision, **kwargs)
    # The brace is escaped so the pattern is an unambiguous literal match
    # instead of relying on the regex parser tolerating a bare '{'.
    return graphviz.Source(re.sub(r'Tree \{', f'Tree {{ size={size}; ratio={ratio}', s))


# +
import scipy.stats
from scipy.cluster import hierarchy as hc


def cluster_columns(df, figsize=(10,6), font_size=12):
    """Show a dendrogram of `df`'s columns clustered by Spearman correlation.

    Args:
        df: table whose columns are compared; ``df.columns`` label the leaves.
        figsize: size of the matplotlib figure.
        font_size: leaf label font size for the dendrogram.
    """
    # Rounded to 4 decimals; presumably so float noise cannot make 1 - corr
    # fail squareform's validity checks -- TODO confirm.
    corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    corr_condensed = hc.distance.squareform(1 - corr)
    z = hc.linkage(corr_condensed, method='average')
    plt.figure(figsize=figsize)
    hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=font_size)
    plt.show()