import env from '@/env';
import axios from 'axios';
import { getServiceStore } from '@/plugins/FeathersAPI';

import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk';
import { getOrthographicMapping } from '../../helpers/orthographic-mapping';

// authorization tokens from Microsoft will last 10 minutes. we will refresh our token if a call is made after 9 minutes passes
const NINE_MINUTES = 9 * 60 * 1000;

export default {
	servicePath: 'speech-recognition',
	modelName: false,
	state: {
		// the latest authorization token
		authorizationObject: null, // {token, region}
		// when the latest token expires
		authorizationTokenExpiry: null,

		// the speech recognizer
		speechRecognizer: null,

		// stored speech recognition results in an object. key is a uuid, and value is an object with blob, src, and recognition results
		storedResults: null,

		// uuids of the current and last recognition results
		currentResultId: null,
		previousResultId: null
	},
	actions: {
		// TODO:
		// remove recognizeScriptedSpeech and recognizeUnscriptedSpeech?
		// just add documentation about scripted and unscripted to 'doSpeechRecognition'

		// users of the speech-recognition service can call two functions for speech recognition:

		// 1) recognizeScriptedSpeech
		// - this is a call to the speech recognition engine while giving it reference text that we expect to be spoken
		// - pass in audio data, src, number of phoneme alternatives desired, and must include referenceText

		// 2) recognizeUnscriptedSpeech
		// - this is a call to the speech recognition engine asking it to give us back what it thinks it heard

		// NOTE: the result is an array of elements, each of which is an alternative of what the engine thinks it heard, with
		// the first element of the array having the highest confidence. In the case of the scripted call, unfortunately, it
		// only returns one element in this array, as it appears Pronunciation Assessment only provides that. However, for the
		// unscripted call, where we don't use Pronunciation Assessment, it can potentially return multiple alternatives.

		recognizeScriptedSpeech(params) {
			return this.doSpeechRecognition(params);
		},

		recognizeUnscriptedSpeech(params) {
			return this.doSpeechRecognition(params);
		},

		// returns the most recently stored result. from that result, returns the alternative with the highest confidence
		getCurrentResult() {
			if (!this.currentResultId) {
				return null;
			}
			let result = this.storedResults[this.currentResultId]?.result;
			if (result) {
				return result[0];
			}
			return null;
		},
		getCurrentRecording() {
			if (!this.currentResultId) {
				return null;
			}
			let result = this.storedResults[this.currentResultId];
			if (result) {
				return result;
			}
			return null;
		},

		// helper function to get a valid authorization token for the Speech SDK. used internally
		// NOTE: do not cache the returned token as this action automatically gets a refreshed the token if the current token is expired
		async getAuthorizationToken() {
			// if the token is not valid, get a new one
			if (!this.isAuthorizationTokenValid()) {
				let result = await this.find();
				if (result.status === 'success') {
					this.authorizationObject = { token: result.token, region: result.region };
					this.authorizationTokenExpiry = Date.now() + NINE_MINUTES; // tokens expire in 10 minutes. use 9 minutes
					return result;
				} else {
					console.error('could not get authorization token for Speech SDK');
				}
			} else {
				// otherwise, return the one we saved
				return this.authorizationObject;
			}
			return null;
		},
		isAuthorizationTokenValid() {
			if (!this.authorizationObject) {
				return false;
			}
			let now = Date.now();
			return now < this.authorizationTokenExpiry;
		},

		async doSpeechRecognition({
			blob,
			src,
			numPhonemeAlternatives,
			referenceText,
			phraseList,
			segmentationSilenceTimeout
		}) {
			const audioConfig = await this.getAudioConfig(blob);
			const speechConfig = await this.getSpeechConfig();
			if (segmentationSilenceTimeout) {
				// used to adjust how much nonspeech audio is allowed within a phrase that's currently being spoken before that phrase is considered "done."
				// "This timeout can be set to integer values between 100 and 5000, in milliseconds, with 500 a typical default."
				// https://learn.microsoft.com/en-gb/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-csharp#change-how-silence-is-handled
				speechConfig.setProperty(
					SpeechSDK.PropertyId.Speech_SegmentationSilenceTimeoutMs,
					segmentationSilenceTimeout.toString()
				);
			}
			if (!audioConfig || !speechConfig) {
				return;
			}

			// create the SpeechRecognizer
			this.speechRecognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);

			if (phraseList && phraseList.length > 0) {
				const phraseListGrammar = SpeechSDK.PhraseListGrammar.fromRecognizer(this.speechRecognizer);
				phraseListGrammar.addPhrases(phraseList);
			}

			// apply pronunciation assessment config to recognizer if it is a scripted call with reference text
			if (referenceText) {
				const pronunciationAssessmentConfig = this.getPronunciationAssessmentConfig(
					referenceText,
					numPhonemeAlternatives
				);
				if (!pronunciationAssessmentConfig) {
					return;
				}
				pronunciationAssessmentConfig.applyTo(this.speechRecognizer);
			}

			const result = await this.recognize(this.speechRecognizer);
			// console.log('finished recognize with', result);
			return await this.onRecognizedResult(blob, src, numPhonemeAlternatives, result);
		},
		async getAudioConfig(blob) {
			let pushStream = SpeechSDK.AudioInputStream.createPushStream();
			let arrayBuffer = await blob.arrayBuffer();
			pushStream.write(arrayBuffer);
			pushStream.close();
			return SpeechSDK.AudioConfig.fromStreamInput(pushStream);
		},
		async getSpeechConfig() {
			let speechConfig;
			let { token, region } = await this.getAuthorizationToken();
			if (token && region) {
				speechConfig = SpeechSDK.SpeechConfig.fromAuthorizationToken(token, region);

				// request that the resulting json includes alternates, confidence, and other info
				speechConfig.outputFormat = SpeechSDK.OutputFormat.Detailed;
			}
			return speechConfig;
		},
		getPronunciationAssessmentConfig(referenceText, numPhonemeAlternatives) {
			const pronunciationAssessmentConfigObj = {
				gradingSystem: 'HundredMark',
				granularity: 'Phoneme',
				phonemeAlphabet: 'IPA',
				nBestPhonemeCount: numPhonemeAlternatives, // number of alternative phonemes
				enableProsodyAssessment: true
			};

			// including a referenceText in the pronunciationAssessmentConfigObj can increase the
			// confidence percentage of the assessment
			if (referenceText) {
				pronunciationAssessmentConfigObj.referenceText = referenceText;
			}

			const pronunciationAssessmentConfig = SpeechSDK.PronunciationAssessmentConfig.fromJSON(
				JSON.stringify(pronunciationAssessmentConfigObj)
			);

			// to get more information for content assessment
			// pronunciationAssessmentConfig.enableContentAssessmentWithTopic('weather');

			return pronunciationAssessmentConfig;
		},
		recognize(speechRecognizer) {
			return new Promise((resolve, reject) => {
				speechRecognizer.recognizeOnceAsync(
					(result) => {
						resolve(result);
					},
					(err) => {
						console.error('speech recognition error:', err);
						reject();
					}
				);
			});
		},
		onRecognizedResult(blob, src, numPhonemeAlternatives, result) {
			let returnValue = []; // array of alternatives. index 0 is best alternative
			// console.log('onRecognizedResult', result);

			switch (result.reason) {
				case SpeechSDK.ResultReason.NoMatch: {
					let details = SpeechSDK.NoMatchDetails.fromResult(result);
					console.log('nomatch', details);
					break;
				}
				case SpeechSDK.ResultReason.Canceled: {
					let details = SpeechSDK.CancellationDetails.fromResult(result);
					console.log('canceled', details);
					break;
				}
				case SpeechSDK.ResultReason.RecognizedSpeech:
				case SpeechSDK.ResultReason.TranslatedSpeech:
				case SpeechSDK.ResultReason.RecognizedIntent: {
					let detailedResultJson = JSON.parse(result.json);
					// console.log('detailedResultJson', detailedResultJson);
					console.log('Speech recognition results', detailedResultJson['NBest']);
					if (!detailedResultJson) {
						break;
					}

					//  detailedResultJson['NBest'] = array of recognition alternates
					//  detailedResultJson['NBest'][0] = highest-confidence alternate
					//  detailedResultJson['NBest'][0]['Confidence'] = raw confidence score

					let alternatives = detailedResultJson['NBest'];
					for (let alternative of alternatives) {
						// let alternative = detailedResultJson['NBest'][0]; // highest confidence

						// Lexical is what words the engine recognized
						let text = alternative['Lexical'];

						// what was spoken with default profanity filter applied (which is to mask the bad words with stars)
						// NOTE: ITN means the text has been processed (some punctuation, capitalization, etc)
						let maskedText = alternative['MaskedITN'];

						// this is the overall confidence score for what the engine things was said
						let confidence = alternative['Confidence'];

						//let pronunciationAssessmentResult = SpeechSDK.PronunciationAssessmentResult.fromResult(result);
						//console.log('pronunciationAssessmentResult', pronunciationAssessmentResult);

						let wordsArray = alternative['Words'];
						let words = [];
						if (!wordsArray) {
							// constructing the list of words isn't really necessary here (this is typically the unscripted case), but we keep it
							// so that the structure of the returned value is the same between the scripted and unscripted calls
							let listOfWords = text.split(' ');
							for (const w of listOfWords) {
								words.push({ word: w });
							}
						} else {
							for (let wordObj of wordsArray) {
								let word = wordObj['Word'];
								// NOTE: sometimes Microsoft doesn't give us the syllables (noticed particularly in unscripted calls)
								let sounds = wordObj['Syllables']?.map((item) => item['Syllable']).join('-');
								if (!sounds) {
									console.log('speech recognition did not give us syllables for a word');
									// if no sounds were given to us from microsoft, we can't get the orthographic mapping so continue to next word
									continue;
								}
								let mapping = getOrthographicMapping(word, sounds);
								let phonemes = wordObj['Phonemes'];

								// console.log('mapping', mapping);
								// console.log('phonemes', phonemes);

								// NOTE: in the Microsoft results, each element of the phonemes array contains a sound, but it can be written in two
								// letters. These are typically sounds which are represented as two characters (eg. eɪ, aɪ, oʊ). We would like to put
								// the phoneme scores into our mapping array so that we can see what alternative phonemes match with which sounds.
								// A mapping array has mapping objects -- each element represents a sound-to-letter mapping.
								// To achieve putting the phoneme scores into our mapping array, we loop through the phoneme array from Microsoft,
								// and figure out which sound in our mapping array we should be comparing to. We use a pointer that points to a particular
								// sound in the mapping array. This pointer has an index (elementIndex) into the mapping array, and an index (soundIndex)
								// into the sounds.

								let phonemePointer = { elementIndex: 0, soundIndex: 0 };
								const getSoundInMapping = (phonemePointer, length) => {
									return mapping[phonemePointer.elementIndex].sound.substring(
										phonemePointer.soundIndex,
										phonemePointer.soundIndex + length
									);
								};
								const addPhonemeScoreToMapping = (phonemeChars, sounds, i) => {
									if (phonemeChars === sounds) {
										if (!mapping[phonemePointer.elementIndex].recognizedPhonemes) {
											mapping[phonemePointer.elementIndex].recognizedPhonemes = [];
										}
										let phonemeAlternatives;
										if (numPhonemeAlternatives > 0) {
											phonemeAlternatives = phonemes[i]['PronunciationAssessment']['NBestPhonemes'].map((item) => ({
												phoneme: item.Phoneme,
												score: item.Score
											}));
										}
										// push the alternatives and scores into an array in case there are multiple sounds in a particular mapping object
										mapping[phonemePointer.elementIndex].recognizedPhonemes.push({
											expectedPhoneme: sounds,
											phonemeAlternatives
										});

										// adjust the phoneme pointer accordingly
										phonemePointer.soundIndex += phonemeChars.length;
										if (phonemePointer.soundIndex >= mapping[phonemePointer.elementIndex].sound.length) {
											// if we've reached the end of the sounds in this mapping object, advance the elementIndex and reset soundIndex
											phonemePointer.elementIndex++;
											phonemePointer.soundIndex = 0;
										}
									} else {
										console.log("issue: a phoneme doesn't match!");
									}
								};

								if (numPhonemeAlternatives > 0) {
									// go through all the phonemes from the Microsoft results and put alternatives and scores into the mapping
									for (let i = 0; i < phonemes.length; i++) {
										let phonemeChars = phonemes[i]['Phoneme']; // the next phoneme to look at from the Microsoft results
										let sounds = getSoundInMapping(phonemePointer, phonemeChars.length); // the next sound from our mapping array
										addPhonemeScoreToMapping(phonemeChars, sounds, i); // add the phoneme score from the Microsoft results to the proper place in our mapping array
									}
								}

								let accuracyScore = wordObj['PronunciationAssessment']['AccuracyScore'];

								// let allPhonemes = phonemes.map((item) => item.Phoneme);
								// console.log(allPhonemes);

								words.push({ word, sounds, accuracyScore, mapping });
							}
						}

						let uuid = crypto.randomUUID();

						returnValue.push({ text, maskedText, confidence, words });
						// returnValue = { text, maskedText, confidence, words };

						if (!this.storedResults) {
							this.storedResults = {};
						}
						let recognitionResult = { uuid, blob, src, result: returnValue };
						this.storedResults[uuid] = recognitionResult;

						// if we don't need to keep the old recognition results, we can get rid of them as we update the previousResultId
						if (this.previousResultId) {
							this.deleteRecording(this.previousResultId);
						}

						// update the currentResultId and previousResultId accordingly
						if (this.currentResultId) {
							// if this.currentResultId is null (ie, we called deleteRecording), we don't need to set previous to null when we record something new
							this.previousResultId = this.currentResultId;
						}
						this.currentResultId = uuid;
						break;
					}
				}
			}
			this.speechRecognizer.close();
			this.speechRecognizer = null;

			return returnValue;
		},

		async uploadCurrentRecording(data) {
			if (this.currentResultId) {
				data.recognitionResults = this.getCurrentResult();
				await this.uploadRecording(this.currentResultId, data);
			}
		},
		// data param can be whatever activity would like (e.g. activityId, whether answer was overridden to correct or incorrect, etc)
		uploadRecording(uuid, data) {
			if (!(uuid in this.storedResults)) {
				console.log('recording and results not found for', uuid);
				return;
			}

			let { blob } = this.storedResults[uuid];
			if (blob) {
				let fileType = { type: 'audio/wav' };
				let filename = uuid + '.wav';
				let file = new File([blob], filename, fileType);
				let upload = new FormData();
				upload.append('file', file);
				this.uploading = true;
				let uploadUrl = env('API_URL') + '/uploads/error-recordings';
				let apiAccessToken = getServiceStore('auth').accessToken;
				axios
					.post(uploadUrl, upload, {
						headers: {
							Authorization: 'Bearer ' + apiAccessToken,
							'Content-Type': 'multipart/form-data'
						}
						// onUploadProgress: ((progressEvent) => {
						// 	console.log('progressEvent', progressEvent);
						// 	//this.uploadPercentage = parseInt(Math.round((progressEvent.loaded * 100) / progressEvent.total));
						// }).bind(this)
					})
					// .then((result) => {
					// 	console.log('result', result);
					// })
					.catch((error) => {
						console.error('issue uploading to error-recordings', error);
					});

				// update recordings table
				getServiceStore('error-recordings').create({ filename, data });
			}
		},
		// remove the audio blob and result from the store. for now, the uploaded file is not removed
		deleteRecording(uuid) {
			if (!(uuid in this.storedResults)) {
				//console.log('recording and results not found for', uuid);
				return;
			}
			delete this.storedResults[uuid];
		},
		clearAllRecordings() {
			for (let uuid in this.storedResults) {
				this.deleteRecording(uuid);
				if (this.currentResultId == uuid) {
					this.currentResultId = null;
				}
				if (this.previousResultId == uuid) {
					this.previousResultId = null;
				}
			}
		}
		// test() {
		// 	let m = getOrthographicMapping('ok', 'oʊ-keɪ');
		// 	console.log('ok', m);

		// 	m = getOrthographicMapping('mop', 'mɑp');
		// 	console.log('mop', m);

		// 	m = getOrthographicMapping('blocks', 'blɑks');
		// 	console.log('blocks', m);

		// 	m = getOrthographicMapping('toast', 'toʊst');
		// 	console.log('toast', m);

		// 	m = getOrthographicMapping('nachos', 'nɑ-tʃoʊz');
		// 	console.log('nachos', m);

		// 	m = getOrthographicMapping('cotton', 'kɑ-tən');
		// 	console.log('cotton', m);

		// 	m = getOrthographicMapping('crackers', 'kræ-kərz');
		// 	console.log('crackers', m);

		// 	m = getOrthographicMapping('hospital', 'hɑ-spɪ-təl');
		// 	console.log('hospital', m);

		// 	m = getOrthographicMapping('refrigerator', 'rə-frɪ-dʒər-eɪ-tər');
		// 	console.log('refrigerator', m);

		// 	m = getOrthographicMapping('beard', 'bird');
		// 	console.log('beard', m);

		// 	m = getOrthographicMapping('octopus', 'ɑk-tə-pəs');
		// 	console.log('octopus', m);

		// 	m = getOrthographicMapping('knee', 'ni');
		// 	console.log('knee', m);

		// 	m = getOrthographicMapping('dissatisfy', 'dɪ-sæ-tɪ-sfaɪ');
		// 	console.log('dissatisfy', m);

		// 	//---CASE1: ALL SOUNDS ACCOUNTED FOR, BUT UNMAPPED LETTER (SHOULD GROUP IT WITH PREVIOUS IF POSSIBLE -- ALSO KEEP IT IN SAME SYLLABLE... doable?)

		// 	// unmapped letter au (unmapped letter, no missing sound, should group it with previous)
		// 	m = getOrthographicMapping('restaurant', 'rɛ-strɑnt');
		// 	console.log('restaurant', m);

		// 	// unmapped letter e (unmapped letter, no missing sound, just group it with previous)
		// 	m = getOrthographicMapping('house', 'haʊs');
		// 	console.log('house', m);

		// 	// unmapped spelling ugh
		// 	m = getOrthographicMapping('though', 'θoʊ');
		// 	console.log('though', m);

		// 	// unmapped letter l
		// 	m = getOrthographicMapping('balm', 'bɑm');
		// 	console.log('balm', m);

		// 	// unmapped letter o (should group it with previous?)
		// 	m = getOrthographicMapping('graduation', 'ɡræ-dʒu-eɪ-ʃən');
		// 	console.log('graduation', m);

		// 	// ---CASE2: UNMAPPED LETTER, UNMAPPED SOUND IN SAME PLACE, SHOULD ASSIGN THEM

		// 	// unmapped spelling oe, unmapped sound u
		// 	m = getOrthographicMapping('canoe', 'kə-nu');
		// 	console.log('canoe', m);

		// 	// unmatched u letter should map to w sound
		// 	m = getOrthographicMapping('queen', 'kwin');
		// 	console.log('queen', m);

		// 	//---CASE3: ALL LETTERS ACCOUNTED FOR, BUT UNMAPPED SOUND. SHOULD GROUP SOUND WITH PREVIOUS? CHECK WITH MEGAN?

		// 	// the sound s doesn't have any spelling to map to, so it should group with the k sound
		// 	m = getOrthographicMapping('fox', 'fɑks');
		// 	console.log('fox', m);

		// 	// EXTRA TESTS

		// 	m = getOrthographicMapping('actor', 'æk-tər');
		// 	console.log('actor', m);

		// 	m = getOrthographicMapping('bird', 'bɝrd');
		// 	console.log('bird', m);

		// 	m = getOrthographicMapping('airplane', 'ɛr-pleɪn');
		// 	console.log('airplane', m);

		// 	m = getOrthographicMapping('excel', 'ɪk-sɛl');
		// 	console.log('excel', m);

		// 	m = getOrthographicMapping('game', 'ɡeɪm');
		// 	console.log('game', m);

		// 	m = getOrthographicMapping('asparagus', 'ə-spæ-rə-ɡəs');
		// 	console.log('asparagus', m);

		// 	m = getOrthographicMapping('dough', 'doʊ');
		// 	console.log('dough', m);

		// 	m = getOrthographicMapping('bicycle', 'baɪ-sə-kəl');
		// 	console.log('bicycle', m);

		// 	m = getOrthographicMapping('cabbage', 'kæ-bɪdʒ');
		// 	console.log('cabbage', m);

		// 	m = getOrthographicMapping('cylinder', 'sɪ-lən-dər'); // 2
		// 	console.log('cylinder', m);

		// 	m = getOrthographicMapping('calculator', 'kæl-kjə-leɪ-tər');
		// 	console.log('calculator', m);

		// 	m = getOrthographicMapping('daughter', 'dɔ-tər'); // 3
		// 	console.log('daughter', m);

		// 	m = getOrthographicMapping('doughnut', 'doʊ-nʌt');
		// 	console.log('doughnut', m);

		// 	m = getOrthographicMapping('finger', 'fɪŋ-ɡər');
		// 	console.log('finger', m);

		// 	m = getOrthographicMapping('computer', 'kəm-pju-tər');
		// 	console.log('computer', m);

		// 	m = getOrthographicMapping('cube', 'kjub');
		// 	console.log('cube', m);

		// 	m = getOrthographicMapping('chocolate', 'tʃɑ-klət');
		// 	console.log('chocolate', m);

		// 	m = getOrthographicMapping('blood', 'blʌd');
		// 	console.log('blood', m);

		// 	m = getOrthographicMapping('honour', 'hon-nər');
		// 	console.log('honour', m);

		// 	m = getOrthographicMapping('honor', 'hon-nər');
		// 	console.log('honor', m);

		// 	m = getOrthographicMapping('box', 'bɑks');
		// 	console.log('box', m);

		// 	m = getOrthographicMapping('candlestick', 'kændl-stɪk');
		// 	console.log('candlestick', m);

		// 	m = getOrthographicMapping('fire', 'faɪ-ər');
		// 	console.log('fire', m);

		// 	m = getOrthographicMapping('x-ray', 'ɛks-reɪ');
		// 	console.log('x-ray', m);

		// 	m = getOrthographicMapping('TV', 'ti-vi');
		// 	console.log('TV', m);

		// 	m = getOrthographicMapping('Christmas', 'krɪ-sməs');
		// 	console.log('Christmas', m);

		// 	m = getOrthographicMapping('vegetable', 'vɛ-dʒə-tə-bəl');
		// 	console.log('vegetable', m);

		// 	m = getOrthographicMapping('choir', 'kwaɪ-ər');
		// 	console.log('choir', m);

		// 	m = getOrthographicMapping('mozzarella', 'mɑt-sə-rɛ-lə');
		// 	console.log('mozzarella', m);

		// 	m = getOrthographicMapping('squirrel', 'skwɝr-əl');
		// 	console.log('squirrel', m);

		// 	m = getOrthographicMapping('bottlecap', 'bɑ-təl-kæp');
		// 	console.log('bottlecap', m);

		// 	m = getOrthographicMapping('firefighter', 'faɪ-ər-faɪ-tər');
		// 	console.log('firefighter', m);
		// 	// console.log('firefighter', JSON.stringify(m, null, 5));

		// 	m = getOrthographicMapping('square', 'skwɛr');
		// 	console.log('square', m);
		// 	// console.log('firefighter', JSON.stringify(m, null, 5));

		// 	m = getOrthographicMapping('tongue', 'tʌŋ');
		// 	console.log('tongue', m);

		// 	m = getOrthographicMapping('box', 'bɑkst');
		// 	console.log('box', m);
		// }
	}
};
