## Download and extract dataset files

# Run the `dataset` make target from the parent directory (IPython shell escape).
!cd .. && make dataset && cd notebooks

>>> Downloading and saving data files...
Data files already downloaded.
>>> OK.


              
                ## Import and configure libraries

import json
import warnings
from pathlib import Path

import modin.pandas as pd
import pandas
import plotly.io as pio
from pandas_profiling import ProfileReport
from tqdm import tqdm

# Hide UserWarning and FutureWarning messages so they do not clutter the outputs.
for _category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

# Use the "notebook" renderer for Plotly figures.
pio.renderers.default = "notebook"

# Make DataFrame plotting go through Plotly.
pd.options.plotting.backend = "plotly"

# Set constants: data directory and the raw Frames JSON file inside it.
DATA_PATH = Path("../data")
FRAMES_JSON_PATH = DATA_PATH / "raw/frames.json"


              
                ## Load the dataset into a pandas DataFrame (in memory)

# Read the raw Frames JSON file into a DataFrame.
raw_data = pd.read_json(FRAMES_JSON_PATH)

# Summarise every column (include="all"); kept as the last expression so the
# notebook displays the resulting table.
raw_data.describe(include="all")

[codecarbon INFO @ 18:20:35] Energy consumed for RAM : 0.000024 kWh. RAM Power : 5.7580060958862305 W
[codecarbon INFO @ 18:20:35] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 18:20:35] 0.000024 kWh of electricity used since the begining.


              
                # Identifier columns of each raw record, plus the two survey answers
# stored in its `labels` dictionary.
frames = raw_data[["id", "wizard_id", "user_id"]]
survey_columns = ["userSurveyRating", "wizardSurveyTaskSuccessful"]
frames[survey_columns] = [
    [labels[name] for name in survey_columns] for labels in raw_data["labels"]
]
# Cast the rating to float and the success flag to bool.
frames = frames.astype(dict(zip(survey_columns, ("float", "bool"))))
frames.describe(include="all")


              
                # Replay every entry of raw_data["turns"] message by message, printing
# the slot values accumulated so far after each message.
for dialogue in raw_data["turns"]:
    print()
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print()

    # Slot values are reset at the start of every dialogue.
    known_facts = {}
    for i, frame in enumerate(dialogue):
        print(f'{i} - { frame["author"] } says : \n"{ frame["text"] }"')

        # Keep the last annotation of every slot; a negated one is stored as None.
        for f in frame["labels"]["frames"]:
            for info_key, info in f["info"].items():
                latest = info[-1]
                if latest["negated"]:
                    known_facts[info_key] = None
                else:
                    known_facts[info_key] = latest["val"]

        print(f"Known facts : \n{known_facts}")
        print()

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

0 - user says :
"I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."
Known facts :
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '8'}

1 - wizard says :
"Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?"
Known facts :
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '8', 'NO_RESULT': True}

2 - user says :
"Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900."
Known facts :
{'intent': 'book', 'budget': '1900.0', 'dst_city': 'Neverland', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '5', 'NO_RESULT': True}

3 - wizard says :
"I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?"
Known facts :
{'intent': 'book', 'budget': '1900.0', 'dst_city': 'Neverland', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '5', 'NO_RESULT': True}

4 - user says :
"I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?"
Known facts :
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Atlantis', 'str_date': 'august 13', 'n_adults': '8', 'NO_RESULT': True, 'flex': False}

5 - wizard says :
"I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?"
Known facts :
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Atlantis', 'str_date': 'august 13', 'n_adults': '8', 'NO_RESULT': True, 'flex': False}

6 - user says :
"I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help"
Known facts :
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Atlantis', 'str_date': 'august 13', 'n_adults': '8', 'NO_RESULT': True, 'flex': False}
...


              
                # Build (or reload from cache) the `turns` table: one row per user
# message, holding its text and the slot values known before ("old_*") and
# after ("new_*") that message.
TURNS_CSV_PATH = Path(DATA_PATH, "processed/turns.csv")


def _latest_slot_values(frame):
    """Return the latest annotated value of every slot found on `frame`.

    Each slot maps to the ``val`` of its last annotation, or to None when that
    annotation is flagged ``negated``.
    """
    return {
        info_key: None if info[-1]["negated"] else info[-1]["val"]
        for f in frame["labels"]["frames"]
        for info_key, info in f["info"].items()
    }


if TURNS_CSV_PATH.exists():
    # Reuse the table written by a previous run.
    turns = pd.read_csv(TURNS_CSV_PATH)

else:
    # Rows are gathered in a plain list and turned into a DataFrame once.
    # Appending to the DataFrame inside the loop copies it at every step, and
    # DataFrame.append no longer exists in pandas 2.x.
    turn_rows = []

    for turn in tqdm(raw_data["turns"]):
        # Slot values are reset at the start of every dialogue.
        known_facts = {}

        for frame in turn:
            # Wizard messages produce no row and do not update the known facts.
            if frame["author"] == "wizard":
                continue

            turn_dict = {"text": frame["text"]}

            # Snapshot of the slot values before this message...
            turn_dict.update(
                {f"old_{key}": value for key, value in known_facts.items()}
            )

            known_facts.update(_latest_slot_values(frame))

            # ...and after it.
            turn_dict.update(
                {f"new_{key}": value for key, value in known_facts.items()}
            )

            turn_rows.append(turn_dict)

    turns = pd.DataFrame(turn_rows)

    # Make sure the output directory exists before writing the cache file.
    TURNS_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    turns.to_csv(TURNS_CSV_PATH, index=False)

turns

[codecarbon INFO @ 18:20:50] Energy consumed for RAM : 0.000048 kWh. RAM Power : 5.7580060958862305 W
[codecarbon INFO @ 18:20:50] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 18:20:50] 0.000048 kWh of electricity used since the begining.


              
                # Summarise every column of the `turns` table (include="all").
                turns.describe(include="all")


              
                # Keep the message text plus the old/new value of five selected slots.
columns = ["text"]
for key in ("or_city", "dst_city", "str_date", "end_date", "budget"):
    for prefix in ("old", "new"):
        columns.append(f"{prefix}_{key}")
data = turns[columns]

data


              
                # Summarise every column of the reduced `data` table (include="all").
                data.describe(include="all")


              
                ## Publish the dialogue turns ProfileReport

# The report is built from a plain pandas DataFrame constructed from `data`.
report_frame = pandas.DataFrame(data)

profile = ProfileReport(
    report_frame,
    title="Pandas Profiling Report",
    explorative=True,
    minimal=True,
)

# Write the report to an HTML file in the docs directory.
report_path = Path("../docs/profile_report.html")
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

	user_id	turns	wizard_id	id	labels
count	1369	1369	1369	1369	1369
unique	11	1369	12	1369	16
top	U22K1SX9N	[{'text': 'I'd like to book a trip to Atlantis...	U21T9NMKM	e2c0fc6c-2134-4891-8353-ef16d8412c9a	{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc...
freq	345	1	301	1	929

	id	wizard_id	user_id	userSurveyRating	wizardSurveyTaskSuccessful
count	1369	1369	1369	1366.000000	1369
unique	1369	12	11	NaN	2
top	e2c0fc6c-2134-4891-8353-ef16d8412c9a	U21T9NMKM	U22K1SX9N	NaN	True
freq	1	301	345	NaN	1287
mean	NaN	NaN	NaN	4.573419	NaN
std	NaN	NaN	NaN	0.839596	NaN
min	NaN	NaN	NaN	1.000000	NaN
25%	NaN	NaN	NaN	4.000000	NaN
50%	NaN	NaN	NaN	5.000000	NaN
75%	NaN	NaN	NaN	5.000000	NaN
max	NaN	NaN	NaN	5.000000	NaN

	text	new_intent	new_budget	new_dst_city	new_or_city	new_str_date	new_n_adults	old_intent	old_budget	old_dst_city	...	new_count_seat	old_count_seat	new_dst_city_ok	old_dst_city_ok	new_impl_anaphora	old_impl_anaphora	new_str_date_ok	new_end_date_ok	old_str_date_ok	old_end_date_ok
0	I'd like to book a trip to Atlantis from Capri...	book	1700.0	Atlantis	Caprica	august 13	8	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Yes, how about going to Neverland from Caprica...	book	1900.0	Neverland	Caprica	august 13	5	book	1700.0	Atlantis	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	I have no flexibility for dates... but I can l...	book	1700.0	Atlantis	Atlantis	august 13	8	book	1900.0	Neverland	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	I suppose I'll speak with my husband to see if...	book	1700.0	Atlantis	Atlantis	august 13	8	book	1700.0	Atlantis	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Hello, I am looking to book a vacation from Go...	book	2100.0	Mos Eisley	Gotham City	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10402	5 adults and 7 kids! Yup, the lot of us. We wa...	book	32800.0	-1	Tampa	NaN	5	book	NaN	-1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10403	Oh yes! Between September 12 and 26!	book	32800.0	-1	Tampa	september 12	5	book	32800.0	-1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10404	That sounds amazing, and it's within those dat...	book	32800.0	Queenstown	Tampa	september 12	5	book	32800.0	-1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10405	Ok perfect, book me!	book	32800.0	Queenstown	Tampa	september 12	5	book	32800.0	Queenstown	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10406	Thanks!	book	32800.0	Queenstown	Tampa	september 12	5	book	32800.0	Queenstown	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	text	new_intent	new_budget	new_dst_city	new_or_city	new_str_date	new_n_adults	old_intent	old_budget	old_dst_city	...	new_count_seat	old_count_seat	new_dst_city_ok	old_dst_city_ok	new_impl_anaphora	old_impl_anaphora	new_str_date_ok	new_end_date_ok	old_str_date_ok	old_end_date_ok
count	10407	9362	6229	9631	9620	7430	5570	8078	5255	8307	...	7	6	8	7	5	4	3	3	2	2
unique	9695	1	228	392	339	155	57	1	225	382	...	1	1	1	1	1	1	1	1	1	1
top	Thanks!	book	-1	Punta Cana	-1	-1	1	book	-1	-1	...	two	two	True	True	category	category	True	True	True	True
freq	73	9362	1704	283	174	655	2462	8078	1469	257	...	7	6	8	7	5	4	3	3	2	2

	text	old_or_city	new_or_city	old_dst_city	new_dst_city	old_str_date	new_str_date	old_end_date	new_end_date	old_budget	new_budget
0	I'd like to book a trip to Atlantis from Capri...	NaN	Caprica	NaN	Atlantis	NaN	august 13	NaN	NaN	NaN	1700.0
1	Yes, how about going to Neverland from Caprica...	Caprica	Caprica	Atlantis	Neverland	august 13	august 13	NaN	NaN	1700.0	1900.0
2	I have no flexibility for dates... but I can l...	Caprica	Atlantis	Neverland	Atlantis	august 13	august 13	NaN	NaN	1900.0	1700.0
3	I suppose I'll speak with my husband to see if...	Atlantis	Atlantis	Atlantis	Atlantis	august 13	august 13	NaN	NaN	1700.0	1700.0
4	Hello, I am looking to book a vacation from Go...	NaN	Gotham City	NaN	Mos Eisley	NaN	NaN	NaN	NaN	NaN	2100.0
...	...	...	...	...	...	...	...	...	...	...	...
10402	5 adults and 7 kids! Yup, the lot of us. We wa...	Tampa	Tampa	-1	-1	NaN	NaN	NaN	NaN	NaN	32800.0
10403	Oh yes! Between September 12 and 26!	Tampa	Tampa	-1	-1	NaN	september 12	NaN	26	32800.0	32800.0
10404	That sounds amazing, and it's within those dat...	Tampa	Tampa	-1	Queenstown	september 12	september 12	26	26	32800.0	32800.0
10405	Ok perfect, book me!	Tampa	Tampa	Queenstown	Queenstown	september 12	september 12	26	25	32800.0	32800.0
10406	Thanks!	Tampa	Tampa	Queenstown	Queenstown	september 12	september 12	25	25	32800.0	32800.0

	text	old_or_city	new_or_city	old_dst_city	new_dst_city	old_str_date	new_str_date	old_end_date	new_end_date	old_budget	new_budget
count	10407	8287	9620	8307	9631	6287	7430	4787	5734	5255	6229
unique	9695	332	339	382	392	151	155	129	131	225	228
top	Thanks!	-1	-1	-1	Punta Cana	-1	-1	-1	-1	-1	-1
freq	73	158	174	257	283	567	655	344	404	1469	1704

Fly Me: flight booking chatbot