<img style='float:left' src='https://i.ibb.co/pzSzM5y/logo-200x200.png' width='200px'>
<br><br><br><br>

**Work with data & machine learning models**

* easily store data in a high-performance data cluster (MongoDB)
* store your fitted or unfitted scikit-learn models
* run predictions on the compute cluster directly from stored data
* store & use remote data (ftp, http, s3)

**Easily use compute resources in the cluster**

* fit models in the compute cluster, in parallel
* perform grid search
* all asynchronously

**Share data, notebooks**

* write, store & share notebooks directly online, no setup required
* run jobs on a regular schedule
* share notebooks and data across users

**Automatic REST API for any client**

* datasets
* models
* jobs (reports)
* arbitrary custom scripts (Python)

**On-Premise or On-Cloud Custom Installation**

* customizable backends (e.g. Spark, R, SAS)
* custom runtimes (e.g. Dask, Spark)
* arbitrary data storage extensions API
* custom data types extensions API
* native-Python data streaming API (like Spark Streaming, but much simpler)

In [None]:
import warnings; warnings.filterwarnings('ignore')
%pip install --user -U yfinance tqdm matplotlib

In [None]:
# import omega-ml as `om` (the entry point used by every cell below) and run its setup
import omegaml as om 
om.setup()

In [None]:
# list the names of the datasets stored in the cluster
om.datasets.list()

In [None]:
# list the names of the models stored in the cluster
om.models.list()

In [None]:
# list the jobs (notebooks) and their results stored in the cluster
om.jobs.list()

In [None]:
# list the custom scripts stored in the cluster
om.scripts.list()

In [None]:
# store any Python data under the name 'mydata', then read it back
om.datasets.put(['any data'], 'mydata')
om.datasets.get('mydata')

In [None]:
# store numpy arrays and pandas dataframes
import pandas as pd
from sklearn.datasets import load_iris

# one DataFrame holding the iris feature columns plus the target as column 'y'
X, y = load_iris(return_X_y=True)
data = pd.DataFrame(X)
data['y'] = y
# append=False replaces the stored copy; without it every re-run of this cell
# would append another 150 rows to 'iris' (the other cells writing 'iris' in
# this notebook already pass append=False)
om.datasets.put(data, 'iris', append=False)
# read it back from the cluster; as the last expression this is what gets displayed
om.datasets.get('iris').head()

In [None]:
# store remote datasets as a reference (no copy): only the URL is registered
# under the name 'airpax'; get() then reads the data through that reference
om.datasets.put('https://raw.githubusercontent.com/MainakRepositor/Datasets/master/AirPassengers.csv', 'airpax')
om.datasets.get('airpax')

In [None]:
# store financial time series including indicies
%matplotlib inline
import pandas as pd
import datetime
import yfinance as yf

start = datetime.datetime(2017, 1, 1)
end = datetime.datetime(2018, 1, 31)
prices = yf.download('GOOGL', )
prices.head()

In [None]:
# store the downloaded prices as 'google' (append=False, so a re-run does not
# append a second copy), then get data back in their original format
om.datasets.put(prices, 'google', append=False)
prices = om.datasets.get('google')
prices.head()

In [None]:
# filter data in the database -- notice the nice syntax:
# <column>__gte / <column>__lte keyword arguments (here 120 <= Close <= 140)
%time om.datasets.get('google', Close__gte=120, Close__lte=140)

In [None]:
# build a large dataset: append copies of prices to 'google-large' until it
# holds at least N records
from tqdm import tqdm
N = 1e6
ldf_google_large = om.datasets.getl('google-large')
# how many more copies of prices are needed; `or []` counts a missing dataset as 0 rows
dupl = int(
    (N - len(ldf_google_large or [])) / len(prices) + 1
)
for i in tqdm(range(dupl)):
    # put() without append=False adds the rows to the existing dataset
    om.datasets.put(prices, 'google-large')
print(f"google-large has {len(om.datasets.getl('google-large'))} records")

In [None]:
# filter & aggregate data locally (let's make it large)
def getdata():
  data = om.datasets.get('google-large')
  return data[(data.Close >= 100) & (data.Close <= 140)].mean() 

%time getdata()

In [None]:
# filter and aggregate by database - 2-3x faster
# same selection as getdata() above, but through getl() with keyword filters
# instead of loading the full dataset first
%time om.datasets.getl('google-large', Close__gte=100, Close__lte=140).mean().iloc[0]

In [None]:
# index based access by loading data first
def getdata():
    dfx = om.datasets.get('google-large')
    return dfx.loc[pd.to_datetime('2017-01-03')]
%time getdata()

In [None]:
# index-based access directly in database
# same lookup as getdata() above, but on the object returned by getl();
# .value is read to obtain the selected rows (presumably this triggers the query -- TODO confirm)
dfx = om.datasets.getl('google-large')
%time dfx.loc[pd.to_datetime('2017-01-03')].value

In [None]:
# train models locally
%matplotlib inline
import pandas as pd 

from sklearn.svm import SVR

prices = om.datasets.get('google')
X = prices[['High', 'Low']].rolling(5).mean().dropna()
y = prices.iloc[4:]['Close']
print(X.shape, y.shape)

train_loc = X.shape[0] // 2

model = SVR(kernel='linear', tol=0.1)
model.fit(X.iloc[0:train_loc], y.iloc[0:train_loc])

r2 = model.score(X, y)
yhat = pd.DataFrame({'yhat': model.predict(X[train_loc:])})
yhat.index = X.index[train_loc:]

ax = prices.iloc[train_loc:]['Close'].plot()
yhat.plot(color='r', ax=ax)

In [None]:
# predict remotely

# store models and new data
om.models.put(model, 'google-predict')
om.datasets.put(X[train_loc:], 'google-rolling', append=False)

# then predict remotely
pred = om.runtime.model('google-predict').predict('google-rolling[High,Low]').get()

# show results
pred = pd.DataFrame({'yhat': pred}, index=range(len(pred)))
# X is the 5-row rolling mean with its leading NaN rows dropped, so X[train_loc:]
# corresponds to the LAST len(pred) rows of the price series. Align against
# the tail of Close; slicing from train_loc would shift every prediction
# 4 rows too early.
close = om.datasets.get('google[Close]')
actual = close.iloc[len(close) - len(pred):]
pred.index = actual.index
ax = actual.plot()
pred.plot(color='r', ax=ax)

In [None]:
# we can also train remote
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans

# iris features/targets, plus a combined DataFrame (features + 'y') that
# later cells store as the 'iris' dataset
iris = load_iris()
X, y = iris.data, iris.target

df = pd.DataFrame(X)
df['y'] = y

# fit & predict remote: replace any stored 'iris-model' with a fresh,
# unfitted KMeans and fit it through the runtime
model = KMeans(n_clusters=8)
om.models.drop('iris-model', True)
om.models.put(model, 'iris-model')
om.runtime.model('iris-model').fit(X, y).get()

# get back remote fitted model and show results
model = om.models.get('iris-model')
labels = model.labels_

# 3D scatter of feature columns 3, 0 and 2, coloured by cluster label
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
ax.scatter(
    X[:, 3], X[:, 0], X[:, 2],
    c=labels.astype(float),
    edgecolor='k',
)
fig.show()

In [None]:
# we store lots of information on models: show the attributes kept in the
# metadata of 'iris-model'
om.models.metadata('iris-model').attributes

In [None]:
# perform gridsearch on cluster
# refresh the stored 'iris' dataset (features + 'y'), then try n_clusters = 1..7
om.datasets.put(df, 'iris', append=False)
params = dict(n_clusters=range(1, 8))
om.runtime.model('iris-model').gridsearch(
    'iris[^y]', 'iris[y]', parameters=params,
).get()

In [None]:
# see what gridsearch results we have: they are recorded under the
# 'gridsearch' key of the model's metadata attributes
gsresult = om.models.metadata('iris-model').attributes['gridsearch']

In [None]:
# look at gridsearch results: the first entry names the stored grid-search
# model ('gsModel'); load it and show its best estimator
gsModel = gsresult[0]['gsModel']
gs = om.models.get(gsModel)
gs.best_estimator_

In [None]:
# use the model REST API
import requests
from omegaml.client.auth import AuthenticationEnv
import omegaml as om

# -- setup authentication and API URL (falls back to a local server when
#    OMEGA_RESTAPI_URL is not configured)
auth = AuthenticationEnv.active().get_restapi_auth(om=om)
url = getattr(om.defaults, 'OMEGA_RESTAPI_URL', 'http://localhost:5000')
modelname = 'iris-model'
dataset = 'iris'
# -- prepare dataset: features only, replacing the stored 'iris'
om.datasets.put(pd.DataFrame(X), 'iris', append=False)
# -- call REST API: ask the stored model to predict on the stored dataset
print('Requesting from', url)
resp = requests.put(
    f'{url}/api/v1/model/{modelname}/predict?datax={dataset}',
    auth=auth,
)
resp.json()

In [None]:
# use the datasets REST API (reuses url, dataset and auth from the previous cell)
import requests
import time

# NOTE(review): fixed 5 second pause before querying -- presumably to let the
# preceding dataset write settle; confirm whether it is still needed
time.sleep(5)

print('Requesting from', url)
resp = requests.get(
    f'{url}/api/v1/dataset/{dataset}',
    json={},
    auth=auth,
)
resp.json()

In [None]:
# deploy lambda-style arbitrary algorithms
# om.scripts.put('pkg:///app/omegapkg/demo/helloworld/', 'helloworld')

In [None]:
# run lambdas
# from datetime import datetime
# dtnow = datetime.now().isoformat()
# om.runtime.script('helloworld').run(foo=dtnow).get()

In [None]:
# Commercial Edition
# use REST API to run lambdas
# import requests
# from omegacommon.auth import OmegaRestApiAuth
# auth = OmegaRestApiAuth(**auth_config)
# resp = requests.post('https://omegaml.omegaml.io/api/v1/script/helloworld/run/', 
#                      params=dict(foo=dtnow), auth=auth)
# resp.json()

In [None]:
# run jobs (python notebooks) online
# trigger the report only if the notebook is actually stored in the cluster
if 'scheduled-report.ipynb' in om.jobs.list():
    om.runtime.job('scheduled-report').run()
# list the jobs as the cell's last expression so the result is displayed;
# inside the if-body the value was silently discarded
om.jobs.list()

## Commercial Edition

### per-user online dashboard 
http://omegaml.omegaml.io/dashboard
    
### per-user online notebook automated setup
http://omjobs.omegaml.io/